DeepFM 40M #366

Draft · wants to merge 2 commits into base: main
76 changes: 39 additions & 37 deletions RecommenderSystems/deepfm/deepfm_train_eval.py
@@ -154,7 +154,8 @@ def __init__(
         self.shard_count = shard_count
         self.cur_shard = cur_shard

-        fields = ["Label"]
+        # fields = ["Label"]
+        fields = ["label"]
         fields += [f"I{i+1}" for i in range(num_dense_fields)]
         fields += [f"C{i+1}" for i in range(num_sparse_fields)]
         self.fields = fields
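The dataloader selects parquet columns by exact name, and this dataset evidently uses a lowercase label column, hence the rename. A quick way to confirm the schema before a long run (a sketch; assumes pyarrow is installed and the glob matches your parquet layout, so adjust the path to your DATA_DIR):

    # Sanity-check the column names the fields list must match exactly:
    import glob
    import pyarrow.parquet as pq

    sample_file = glob.glob("/RAID0/xiexuan/criteo1t_parquet_40M_long/train/*.parquet")[0]
    print(pq.read_schema(sample_file).names)  # expect ['label', 'I1', ..., 'C26']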
@@ -345,7 +346,6 @@ def forward(self, x: flow.Tensor) -> flow.Tensor:
 def interaction(embedded_x: flow.Tensor) -> flow.Tensor:
     return flow._C.fused_dot_feature_interaction([embedded_x], pooling="sum")

-
 class DeepFMModule(nn.Module):
     def __init__(
         self,
@@ -385,7 +385,7 @@ def forward(self, inputs) -> flow.Tensor:
         multi_embedded_x = self.embedding_layer(inputs)
         embedded_x = multi_embedded_x[:, :, 0 : self.embedding_vec_size]
         lr_embedded_x = multi_embedded_x[:, :, -1]
-
+        # FM
         lr_out = flow.sum(lr_embedded_x, dim=1, keepdim=True)
         dot_sum = interaction(embedded_x)
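For readers unfamiliar with the fused kernel: with pooling="sum", fused_dot_feature_interaction should reduce to the classic FM second-order term, i.e. the sum of all pairwise dot products of the field embeddings. A reference sketch of that identity (assumed from the FM formulation, not from the kernel source):

    import oneflow as flow

    def interaction_reference(embedded_x: flow.Tensor) -> flow.Tensor:
        # embedded_x: (batch, num_fields, emb_dim)
        # sum_{i<j} <v_i, v_j> = 0.5 * (||sum_i v_i||^2 - sum_i ||v_i||^2)
        square_of_sum = embedded_x.sum(dim=1) ** 2    # (batch, emb_dim)
        sum_of_square = (embedded_x ** 2).sum(dim=1)  # (batch, emb_dim)
        return 0.5 * (square_of_sum - sum_of_square).sum(dim=1, keepdim=True)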
@@ -446,17 +446,19 @@ def build(self, labels, features):


 def make_lr_scheduler(args, optimizer):
-    batches_per_epoch = math.ceil(args.num_train_samples / args.batch_size)
-    milestones = [
-        batches_per_epoch * (i + 1)
-        for i in range(math.floor(math.log(args.min_lr / args.learning_rate, args.lr_factor)))
-    ]
-    multistep_lr = flow.optim.lr_scheduler.MultiStepLR(
-        optimizer=optimizer, milestones=milestones, gamma=args.lr_factor,
+    warmup_lr = flow.optim.lr_scheduler.LinearLR(
+        optimizer, start_factor=0, total_iters=3000,
     )
-
-    return multistep_lr
+    poly_decay_lr = flow.optim.lr_scheduler.PolynomialLR(
+        optimizer, decay_batch=60000, end_learning_rate=1e-8, power=2.0, cycle=False,
+    )
+    sequential_lr = flow.optim.lr_scheduler.SequentialLR(
+        optimizer=optimizer,
+        schedulers=[warmup_lr, poly_decay_lr],
+        milestones=[10000],
+        interval_rescaling=True,
+    )
+    return sequential_lr

 def get_metrics(logs):
     kv = {"auc": 1, "logloss": -1}
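The scheduler change swaps the epoch-based MultiStepLR for a warmup-then-decay sequence: LinearLR ramps from 0 to the base rate over 3,000 steps and then holds, SequentialLR switches schedulers at step 10,000, and PolynomialLR decays quadratically to 1e-8 over the next 60,000 steps. A plain-Python sketch of the intended curve (scheduler semantics, including interval_rescaling restarting the step counter, are assumed from OneFlow's docs; verify against your version):

    def lr_at(step, base_lr=0.0025):
        # base_lr matches --learning_rate in train_deepfm.sh below.
        if step < 10000:                             # SequentialLR milestone
            return base_lr * min(step / 3000, 1.0)   # linear warmup, then hold
        t = min(step - 10000, 60000)                 # interval_rescaling: restart at 0
        end_lr = 1e-8
        return (base_lr - end_lr) * (1 - t / 60000) ** 2 + end_lr  # power=2.0 decay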
@@ -517,7 +519,7 @@ def save_model(subdir):
     save_model("initial_checkpoint")

     # TODO: clip gradient norm
-    opt = flow.optim.Adam(deepfm_module.parameters(), lr=args.learning_rate)
+    opt = flow.optim.Adam(deepfm_module.parameters(), lr=args.learning_rate, eps=1e-7)
     lr_scheduler = make_lr_scheduler(args, opt)
     loss = flow.nn.BCEWithLogitsLoss(reduction="mean").to("cuda")
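The explicit eps=1e-7 (up from the usual 1e-8 default) is a stability tweak: eps floors the denominator of the Adam update, capping the step size for parameters whose second-moment estimate is tiny, as is common for rarely-hit sparse embedding rows. Schematically (standard Adam formulation, not OneFlow internals):

    def adam_step(param, m_hat, v_hat, lr=0.0025, eps=1e-7):
        # m_hat / v_hat: bias-corrected first / second moment estimates.
        # A larger eps bounds lr * m_hat / (sqrt(v_hat) + eps) when v_hat ~ 0.
        return param - lr * m_hat / (v_hat ** 0.5 + eps)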

@@ -563,37 +565,37 @@ def save_model(subdir):
                     + f"Latency {(latency * 1000):0.3f} ms, Throughput {throughput:0.1f}, {strtime}"
                 )

-            if step % batches_per_epoch == 0:
+            if step % 10000 == 0:
                 epoch += 1
                 auc, logloss = eval(
                     args,
                     eval_graph,
                     tag="val",
                     cur_step=step,
                     epoch=epoch,
                     cached_eval_batches=cached_eval_batches,
                 )
-                if args.save_model_after_each_eval:
-                    save_model(f"step_{step}_val_auc_{auc:0.5f}")
-
-                monitor_value = get_metrics(logs={"auc": auc, "logloss": logloss})
-
-                stop_training, best_metric, stopping_steps, save_best = early_stop(
-                    epoch,
-                    monitor_value,
-                    best_metric=best_metric,
-                    stopping_steps=stopping_steps,
-                    patience=args.patience,
-                    min_delta=args.min_delta,
-                )
-
-                if args.save_best_model and save_best:
-                    if rank == 0:
-                        print(f"Save best model: monitor(max): {best_metric:.6f}")
-                    save_model("best_checkpoint")
-
-                if not args.disable_early_stop and stop_training:
-                    break
+                # if args.save_model_after_each_eval:
+                #     save_model(f"step_{step}_val_auc_{auc:0.5f}")
+
+                # monitor_value = get_metrics(logs={"auc": auc, "logloss": logloss})
+
+                # stop_training, best_metric, stopping_steps, save_best = early_stop(
+                #     epoch,
+                #     monitor_value,
+                #     best_metric=best_metric,
+                #     stopping_steps=stopping_steps,
+                #     patience=args.patience,
+                #     min_delta=args.min_delta,
+                # )
+
+                # if args.save_best_model and save_best:
+                #     if rank == 0:
+                #         print(f"Save best model: monitor(max): {best_metric:.6f}")
+                #     save_model("best_checkpoint")
+
+                # if not args.disable_early_stop and stop_training:
+                #     break

                 deepfm_module.train()
                 last_time = time.time()
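Note that evaluation (and the epoch counter) is now tied to a fixed 10,000-step window rather than to full dataset passes, and checkpointing plus early stopping are commented out entirely. With the new batch size each eval window covers only a fraction of one true epoch; back-of-envelope, using values from train_deepfm.sh below:

    steps_per_eval = 10_000
    batch_size = 55_296
    num_train_samples = 4_195_197_692
    samples_per_eval = steps_per_eval * batch_size    # 552,960,000
    print(samples_per_eval / num_train_samples)       # ~0.13 of a full pass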
36 changes: 21 additions & 15 deletions RecommenderSystems/deepfm/train_deepfm.sh
@@ -1,8 +1,14 @@
 #!/bin/bash
-DEVICE_NUM_PER_NODE=1
-DATA_DIR=/path/to/deepfm_parquet
-PERSISTENT_PATH=/path/to/persistent
-MODEL_SAVE_DIR=/path/to/model/save/dir
+DEVICE_NUM_PER_NODE=4
+DATA_DIR=/RAID0/xiexuan/criteo1t_parquet_40M_long
+PERSISTENT_PATH=/home/zhengzekang/models_dcn/RecommenderSystems/dcn/init_model
+MODEL_SAVE_DIR=/home/zhengzekang/models_dcn/RecommenderSystems/dcn/dcn_model
+
+rm -rf /home/zhengzekang/models_dcn/RecommenderSystems/dcn/init_model/0-4/*
+rm -rf /home/zhengzekang/models_dcn/RecommenderSystems/dcn/init_model/1-4/*
+rm -rf /home/zhengzekang/models_dcn/RecommenderSystems/dcn/init_model/2-4/*
+rm -rf /home/zhengzekang/models_dcn/RecommenderSystems/dcn/init_model/3-4/*
+

 python3 -m oneflow.distributed.launch \
     --nproc_per_node $DEVICE_NUM_PER_NODE \
@@ -12,18 +18,18 @@ python3 -m oneflow.distributed.launch \
     deepfm_train_eval.py \
     --data_dir $DATA_DIR \
     --persistent_path $PERSISTENT_PATH \
-    --table_size_array "649,9364,14746,490,476707,11618,4142,1373,7275,13,169,407,1376,1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572" \
-    --store_type 'cached_host_mem' \
-    --cache_memory_budget_mb 1024 \
-    --batch_size 10000 \
+    --table_size_array "62866,8001,2901,74623,7530,3391,1400,21705,7937,21,276,1235896,9659,39884301,39040,17291,7421,20263,3,7121,1543,63,38532372,2953790,403302,10,2209,11938,155,4,976,14,39979538,25638302,39665755,585840,12973,108,36" \
+    --store_type 'device_mem' \
+    --batch_size 55296 \
     --train_batches 75000 \
-    --loss_print_interval 100 \
+    --loss_print_interval 1000 \
     --dnn "1000,1000,1000,1000,1000" \
-    --net_dropout 0.2 \
-    --learning_rate 0.001 \
+    --net_dropout 0.05 \
+    --learning_rate 0.0025 \
     --embedding_vec_size 16 \
-    --num_train_samples 36672493 \
-    --num_val_samples 4584062 \
-    --num_test_samples 4584062 \
+    --num_train_samples 4195197692 \
+    --num_test_samples 89137319 \
+    --num_val_samples 89137318 \
     --model_save_dir $MODEL_SAVE_DIR \
-    --save_best_model
+    --save_best_model \
+    --amp
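As a rough check that switching to 'device_mem' is feasible here: the model stores embedding_vec_size + 1 values per ID (16 FM dimensions plus the LR slot, per the multi_embedded_x slicing in deepfm_train_eval.py), so the fp32 tables alone come to about 12 GiB, assuming the store shards roughly evenly across the 4 GPUs. A sketch of the arithmetic (optimizer state and AMP master weights excluded):

    # Values copied from --table_size_array above.
    table_sizes = [
        62866, 8001, 2901, 74623, 7530, 3391, 1400, 21705, 7937, 21, 276,
        1235896, 9659, 39884301, 39040, 17291, 7421, 20263, 3, 7121, 1543, 63,
        38532372, 2953790, 403302, 10, 2209, 11938, 155, 4, 976, 14, 39979538,
        25638302, 39665755, 585840, 12973, 108, 36,
    ]
    rows = sum(table_sizes)                      # 189,200,574 rows
    vec = 16 + 1                                 # embedding_vec_size + 1 LR slot
    print(f"{rows * vec * 4 / 2**30:.1f} GiB")   # ~12.0 GiB of fp32 table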