DCN 40M fp32 #367

Draft · wants to merge 2 commits into main

Conversation

MARD1NO (Contributor) commented on Jul 26, 2022


fp32: 
================ Test Evaluation ================
Rank[0], Epoch 7, Step 75000, AUC 0.802074, LogLoss 0.125882, Eval_time 19.48 s, Metrics_time 6.48 s, Eval_samples 89140000, GPU_Memory 15074 MiB, Host_Memory 10726 MiB, 2022-07-27 11:03:38
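
For reference, the two reported metrics are standard binary-classification AUC and binary cross-entropy over the collected eval samples. A minimal scikit-learn sketch (not part of this PR; the labels/preds arrays below are made-up stand-ins for the 89,140,000 eval labels and sigmoid outputs):

import numpy as np
from sklearn.metrics import roc_auc_score, log_loss

# stand-in arrays; in practice these are the gathered eval labels and model outputs
labels = np.array([0, 1, 1, 0, 1], dtype=np.int64)
preds = np.array([0.12, 0.81, 0.64, 0.27, 0.93], dtype=np.float64)

print("AUC:", roc_auc_score(labels, preds))   # corresponds to the AUC column
print("LogLoss:", log_loss(labels, preds))    # corresponds to the LogLoss column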

Corresponding HugeCTR script:

import hugectr
from mpi4py import MPI

data_dir = "/RAID0/liujuncheng/criteo1t_parquet_40M_long"

print(f"{data_dir}/train/_file_list.txt")

solver = hugectr.CreateSolver(batchsize_eval = 55296, # real value
                              batchsize = 55296, # 55296 or 69120
                              lr = 0.0025, # aligned
                              warmup_steps = 2750, 
                              decay_start = 40000, 
                              decay_steps = 40000, 
                              decay_power = 2.0,
                              end_lr = 1e-6,
                              enable_tf32_compute = True,
                              #use_mixed_precision = True,
                              #scaler = 1024,
                              vvgpu = [[0,1,2,3]], # 4 GPUs
                              repeat_dataset = True,
                              use_algorithm_search=False,
                              i64_input_key = True) # set False for int32 keys

reader = hugectr.DataReaderParams(data_reader_type = hugectr.DataReaderType_t.Parquet,
                                  source = [f"{data_dir}/train/_file_list.txt"],
                                  eval_source = f"{data_dir}/test/_file_list.txt",
                                  slot_size_array = [62774, 8001, 2901, 74279, 7513, 3369, 1392, 21627, 7919, 21, 276, 1231236, 9643, 39873199, 38853, 17240, 7421, 20263, 3, 7103, 1540, 63, 38457188, 2929249, 400771, 10, 2209, 11910, 152, 4, 976, 14, 39976779, 25414584, 39639858, 583095, 12929, 108, 36],  # real value
                                  check_type = hugectr.Check_t.Non)
optimizer = hugectr.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.Adam,
                                    update_type = hugectr.Update_t.Local, # may affect performance
                                    beta1 = 0.9,
                                    beta2 = 0.999,
                                    epsilon = 1e-8)

dropout_rate = 0.05

model = hugectr.Model(solver, reader, optimizer)
model.add(hugectr.Input(label_dim = 1, label_name = "labels",
                        dense_dim = 0, 
                        dense_name = "dense",
                        data_reader_sparse_param_array = 
                        [hugectr.DataReaderSparseParam("data1", 2, False, 39)])) # meaning of the 2 and False arguments?

model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.LocalizedSlotSparseEmbeddingHash, # one of three available embedding types
                            workspace_size_per_gpu_in_mb = 15000, # large enough
                            embedding_vec_size = 16,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding1",
                            bottom_name = "data1",
                            slot_size_array = [62774, 8001, 2901, 74279, 7513, 3369, 1392, 21627, 7919, 21, 276, 1231236, 9643, 39873199, 38853, 17240, 7421, 20263, 3, 7103, 1540, 63, 38457188, 2929249, 400771, 10, 2209, 11910, 152, 4, 976, 14, 39976779, 25414584, 39639858, 583095, 12929, 108, 36], 
                            optimizer = optimizer))

model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding1"],
                            top_names = ["reshape_sparse_embedding"],
                            leading_dim=16 * 39))  

model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.MultiCross,
                            bottom_names = ["reshape_sparse_embedding"],
                            top_names = ["multicross1"],
                            num_layers=4))

# layer1
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["reshape_sparse_embedding"],
                            top_names = ["fc1"],
                            num_output=1000))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc1"],
                            top_names = ["relu1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu1"],
                            top_names = ["dropout1"],
                            dropout_rate=dropout_rate))

# layer2
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout1"],
                            top_names = ["fc2"],
                            num_output=1000))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc2"],
                            top_names = ["relu2"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu2"],
                            top_names = ["dropout2"],
                            dropout_rate=dropout_rate))

# layer3
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout2"],
                            top_names = ["fc3"],
                            num_output=1000))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc3"],
                            top_names = ["relu3"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu3"],
                            top_names = ["dropout3"],
                            dropout_rate=dropout_rate))

# layer4
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout3"],
                            top_names = ["fc4"],
                            num_output=1000))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc4"],
                            top_names = ["relu4"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu4"],
                            top_names = ["dropout4"],
                            dropout_rate=dropout_rate))

# layer5
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout4"],
                            top_names = ["fc5"],
                            num_output=1000))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc5"],
                            top_names = ["relu5"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu5"],
                            top_names = ["dropout5"],
                            dropout_rate=dropout_rate))

model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                            bottom_names = ["dropout5", "multicross1"],
                            top_names = ["concat2"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["concat2"],
                            top_names = ["fc6"],
                            num_output=1))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss,
                            bottom_names = ["fc6", "labels"],
                            top_names = ["loss"]))
model.compile()
model.summary()
# model.fit(max_iter = 2300, display = 200, eval_interval = 1000, snapshot = 1000000, snapshot_prefix = "dcn")
model.fit(max_iter = 75000, display = 1000, eval_interval = 4999, snapshot = 1000000, snapshot_prefix = "dcn")
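
As a reading aid for the solver settings above (lr = 0.0025, warmup_steps = 2750, decay_start = 40000, decay_steps = 40000, decay_power = 2.0, end_lr = 1e-6), the implied learning-rate curve is roughly linear warmup followed by polynomial decay. The helper below is an illustrative approximation of that curve, not HugeCTR's actual scheduler:

def lr_at(step, base_lr=0.0025, warmup_steps=2750,
          decay_start=40000, decay_steps=40000,
          decay_power=2.0, end_lr=1e-6):
    # linear warmup, constant until decay_start, then polynomial decay to end_lr
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps
    if step < decay_start:
        return base_lr
    frac = max(0.0, 1.0 - (step - decay_start) / decay_steps)
    return max(end_lr, base_lr * frac ** decay_power)

# e.g. lr_at(60000) == 0.000625, and by max_iter = 75000 the rate is about 3.9e-5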

[HCTR][02:21:10.156][INFO][RK0][main]: Evaluation, AUC: 0.804863
[HCTR][02:21:10.156][INFO][RK0][main]: Eval Time for 100 iters: 3.16018s
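
For context, the MultiCross layer in the script (num_layers = 4) corresponds to the DCN cross network applied to the 624-dim (16 * 39) reshaped embedding, i.e. x_{l+1} = x0 * (x_l · w_l) + b_l + x_l per layer. A minimal numpy sketch of that computation, for illustration only (weights, biases, and the batch are made-up placeholders, not HugeCTR's kernel):

import numpy as np

def cross_network(x0, weights, biases):
    # x0: (batch, d); weights/biases: one (d,) vector per cross layer
    x = x0
    for w, b in zip(weights, biases):
        xw = x @ w                      # (batch,) scalar interaction per sample
        x = x0 * xw[:, None] + b + x    # explicit feature crossing + residual
    return x

d, num_layers = 16 * 39, 4
rng = np.random.default_rng(0)
x0 = rng.normal(size=(8, d)).astype(np.float32)
ws = [rng.normal(scale=0.01, size=d).astype(np.float32) for _ in range(num_layers)]
bs = [np.zeros(d, dtype=np.float32) for _ in range(num_layers)]
out = cross_network(x0, ws, bs)         # (8, 624), analogous to "multicross1"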

MARD1NO changed the title from DCN 40M to DCN 40M fp32 on Jul 27, 2022