Fixed preprocessing and training scripts in Quick-start for Ranking #1017

Merged · 3 commits · Jun 20, 2023
Changes from 2 commits
30 changes: 17 additions & 13 deletions examples/quick_start/scripts/preproc/preprocessing.py
```diff
@@ -232,7 +232,7 @@ def generate_nvt_features(self):
                 feats[col] = feats[col] >> nvt_ops.FillMissing(
                     args.continuous_features_fillna
                 )
-                feats[col] = feats[col] >> nvt_ops.Normalize()
+                feats[col] = feats[col] >> nvt_ops.Normalize()

         if args.target_encoding_features or args.target_encoding_targets:
             if not args.target_encoding_features:
```
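For context, a minimal sketch of the continuous-feature pattern this hunk touches (the column name and fill value are illustrative, not from the script): NVTabular ops are chained with `>>` into a graph that a `Workflow` later fits and applies.

```python
import nvtabular as nvt
from nvtabular import ops as nvt_ops

# Chain ops for a continuous column: impute missing values, then standardize.
cont_feats = ["price"] >> nvt_ops.FillMissing(fill_val=0.0) >> nvt_ops.Normalize()

# The ops only execute once a Workflow is fit to a Dataset.
workflow = nvt.Workflow(cont_feats)
```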
```diff
@@ -244,14 +244,13 @@ def generate_nvt_features(self):

         if args.target_encoding_targets and args.target_encoding_features:
             for target_col in args.target_encoding_targets:
-                feats[f"{target_col}_te_features"] = (
-                    args.target_encoding_features
-                    >> nvt.ops.TargetEncoding(
-                        [target_col],
-                        kfold=args.target_encoding_kfold,
-                        p_smooth=args.target_encoding_smoothing,
-                        out_dtype="float32",
-                    )
+                feats[
+                    f"{target_col}_te_features"
+                ] = args.target_encoding_features >> nvt.ops.TargetEncoding(
+                    [target_col],
+                    kfold=args.target_encoding_kfold,
+                    p_smooth=args.target_encoding_smoothing,
+                    out_dtype="float32",
                 )

         for col in args.user_features:
```
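A hedged sketch of what the reformatted block computes (feature and target names are made up for illustration): `TargetEncoding` replaces each category with a smoothed, out-of-fold mean of the target.

```python
import nvtabular as nvt

# Encode user_id/item_id by the k-fold mean of the binary `click` target;
# p_smooth shrinks rare categories toward the global mean to limit overfitting.
te_features = ["user_id", "item_id"] >> nvt.ops.TargetEncoding(
    ["click"],
    kfold=5,
    p_smooth=20,
    out_dtype="float32",
)
```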
```diff
@@ -322,7 +321,9 @@ def merge_dataset_features_values(
         ).excluding_by_name([INDEX_TMP_COL])

         dataset_joint = nvt.Dataset(
-            dataset_joint, schema=schema_joint, cpu=not self.gpu,
+            dataset_joint,
+            schema=schema_joint,
+            cpu=not self.gpu,
         )

         return dataset_joint
```
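For reference, a small sketch of the call being reformatted here (the dataframe is a placeholder): `nvt.Dataset` wraps a data source, and `cpu` selects between the pandas and cuDF backends.

```python
import pandas as pd
import nvtabular as nvt

df = pd.DataFrame({"user_id": [1, 2], "click": [0, 1]})

# cpu=True keeps the data in pandas; the script passes cpu=not self.gpu,
# so GPU runs use the cuDF backend instead.
dataset = nvt.Dataset(df, cpu=True)
```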
```diff
@@ -442,7 +443,8 @@ def run(self):
                 train_dataset_features, train_dataset_targets, "train", args
             )
             train_dataset_preproc.to_parquet(
-                output_train_dataset_path, output_files=args.output_num_partitions,
+                output_train_dataset_path,
+                output_files=args.output_num_partitions,
             )

         if args.eval_data_path or args.dataset_split_strategy:
```
```diff
@@ -459,7 +461,8 @@ def run(self):
                 eval_dataset_features, eval_dataset_targets, "eval", args
             )
             eval_dataset_preproc.to_parquet(
-                output_eval_dataset_path, output_files=args.output_num_partitions,
+                output_eval_dataset_path,
+                output_files=args.output_num_partitions,
             )

         if args.predict_data_path:
```
```diff
@@ -484,7 +487,8 @@ def run(self):
             logging.info(f"Saving predict/test set: {output_predict_dataset_path}")

             new_predict_dataset.to_parquet(
-                output_predict_dataset_path, output_files=args.output_num_partitions,
+                output_predict_dataset_path,
+                output_files=args.output_num_partitions,
             )
         nvt_save_path = os.path.join(output_dataset_path, "workflow")
         logging.info(f"Saving nvtabular workflow to: {nvt_save_path}")
```
5 changes: 3 additions & 2 deletions examples/quick_start/scripts/ranking/README.md
```diff
@@ -106,9 +106,10 @@ This is an example command line for running the training for the TenRec dataset
-cd /Merlin/examples/quick_start/scripts/ranking/
+cd /Merlin/examples/
 OUT_DATASET_PATH=/outputs/dataset

-CUDA_VISIBLE_DEVICES=0 TF_GPU_ALLOCATOR=cuda_malloc_async python ranking.py --train_data_path $OUT_DATASET_PATH/train --eval_data_path $OUT_DATASET_PATH/eval --output_path ./outputs/ --tasks=click --stl_positive_class_weight 3 --model dlrm --embeddings_dim 64 --l2_reg 1e-4 --embeddings_l2_reg 1e-6 --dropout 0.05 --mlp_layers 64,32 --lr 1e-4 --lr_decay_rate 0.99 --lr_decay_steps 100 --train_batch_size 65536 --eval_batch_size 65536 --epochs 1 --save_model_path ./saved_model
+CUDA_VISIBLE_DEVICES=0 TF_GPU_ALLOCATOR=cuda_malloc_async python -m quick_start.scripts.ranking.ranking --train_data_path $OUT_DATASET_PATH/train --eval_data_path $OUT_DATASET_PATH/eval --output_path ./outputs/ --tasks=click --stl_positive_class_weight 3 --model dlrm --embeddings_dim 64 --l2_reg 1e-4 --embeddings_l2_reg 1e-6 --dropout 0.05 --mlp_layers 64,32 --lr 1e-4 --lr_decay_rate 0.99 --lr_decay_steps 100 --train_batch_size 65536 --eval_batch_size 65536 --epochs 1 --save_model_path ./saved_model

 ### Inputs
```
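The command now launches the trainer as a module from `/Merlin/examples` instead of executing `ranking.py` from its own directory, presumably so that imports under the `quick_start.scripts.ranking` package resolve: `python -m` puts the current directory on `sys.path` and loads the script as part of its package, whereas `python ranking.py` would only see the script's own folder.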
29 changes: 19 additions & 10 deletions examples/quick_start/scripts/ranking/ranking_models.py
```diff
@@ -43,7 +43,8 @@ def get_mlp_model(schema, args, prediction_tasks):
             cat_schema,
             embeddings_regularizer=regularizers.l2(args.embeddings_l2_reg),
             infer_dim_fn=partial(
-                infer_embedding_dim, multiplier=args.embedding_sizes_multiplier,
+                infer_embedding_dim,
+                multiplier=args.embedding_sizes_multiplier,
             ),
         ),
         aggregation="concat",
```
```diff
@@ -74,7 +75,8 @@ def get_dcn_model(schema, args, prediction_tasks):
             schema.select_by_tag(Tags.CATEGORICAL),
             embeddings_regularizer=regularizers.l2(args.embeddings_l2_reg),
             infer_dim_fn=partial(
-                infer_embedding_dim, multiplier=args.embedding_sizes_multiplier,
+                infer_embedding_dim,
+                multiplier=args.embedding_sizes_multiplier,
             ),
         ),
         aggregation="concat",
```
```diff
@@ -158,7 +160,7 @@ def get_deepfm_model(schema, args, prediction_tasks):
     if len(cat_schema_multihot) > 0:
         wide_inputs_block["categorical_mhe"] = mm.SequentialBlock(
             mm.Filter(cat_schema_multihot),
-            mm.ListToDense(max_seq_length=args.multihot_max_seq_length),
+            mm.ToDense(cat_schema_multihot),
            mm.CategoryEncoding(
                 cat_schema_multihot, sparse=True, output_mode="multi_hot"
             ),
```
```diff
@@ -195,7 +197,8 @@ def get_wide_and_deep_model(schema, args, prediction_tasks):
         cat_schema,
         embeddings_regularizer=regularizers.l2(args.embeddings_l2_reg),
         infer_dim_fn=partial(
-            infer_embedding_dim, multiplier=args.embedding_sizes_multiplier,
+            infer_embedding_dim,
+            multiplier=args.embedding_sizes_multiplier,
         ),
     )
```
```diff
@@ -212,7 +215,7 @@ def get_wide_and_deep_model(schema, args, prediction_tasks):
             # 2nd level feature interactions of multi-hot features
             mm.SequentialBlock(
                 mm.Filter(cat_schema.remove_by_tag(Tags.USER_ID)),
-                mm.ListToDense(max_seq_length=args.multihot_max_seq_length),
+                mm.ToDense(cat_schema.remove_by_tag(Tags.USER_ID)),
                 mm.HashedCrossAll(
                     cat_schema.remove_by_tag(Tags.USER_ID),
                     num_bins=args.wnd_hashed_cross_num_bins,
```
```diff
@@ -227,7 +230,7 @@ def get_wide_and_deep_model(schema, args, prediction_tasks):
         wide_preprocess.append(
             mm.SequentialBlock(
                 mm.Filter(cat_schema_multihot),
-                mm.ListToDense(max_seq_length=args.multihot_max_seq_length),
+                mm.ToDense(cat_schema_multihot),
                 mm.CategoryEncoding(
                     cat_schema_multihot, sparse=True, output_mode="multi_hot"
                 ),
```
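The substantive fix in this file swaps `mm.ListToDense`, which appears to have been renamed away in newer merlin-models releases, for `mm.ToDense`, which takes the schema of the list columns to densify rather than a max sequence length. A sketch of the corrected wide branch (`cat_schema_multihot` is assumed to be the multi-hot slice of the schema, as in the surrounding function):

```python
import merlin.models.tf as mm

# Densify the ragged multi-hot lists, then multi-hot encode them for the wide part.
wide_branch = mm.SequentialBlock(
    mm.Filter(cat_schema_multihot),
    mm.ToDense(cat_schema_multihot),  # replaces the former ListToDense(max_seq_length=...)
    mm.CategoryEncoding(cat_schema_multihot, sparse=True, output_mode="multi_hot"),
)
```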
```diff
@@ -248,7 +251,10 @@ def get_wide_and_deep_model(schema, args, prediction_tasks):
         wide_regularizer=regularizers.l2(args.wnd_wide_l2_reg),
         wide_dropout=args.dropout,
         deep_dropout=args.dropout,
-        wide_preprocess=mm.ParallelBlock(wide_preprocess, aggregation="concat",),
+        wide_preprocess=mm.ParallelBlock(
+            wide_preprocess,
+            aggregation="concat",
+        ),
         prediction_tasks=prediction_tasks,
     )
```
```diff
@@ -273,7 +279,8 @@ def get_mmoe_model(schema, args, prediction_tasks):
             schema.select_by_tag(Tags.CATEGORICAL),
             embeddings_regularizer=regularizers.l2(args.embeddings_l2_reg),
             infer_dim_fn=partial(
-                infer_embedding_dim, multiplier=args.embedding_sizes_multiplier,
+                infer_embedding_dim,
+                multiplier=args.embedding_sizes_multiplier,
             ),
         ),
         aggregation="concat",
```
```diff
@@ -310,7 +317,8 @@ def get_cgc_model(schema, args, prediction_tasks):
             schema.select_by_tag(Tags.CATEGORICAL),
             embeddings_regularizer=regularizers.l2(args.embeddings_l2_reg),
             infer_dim_fn=partial(
-                infer_embedding_dim, multiplier=args.embedding_sizes_multiplier,
+                infer_embedding_dim,
+                multiplier=args.embedding_sizes_multiplier,
             ),
         ),
         aggregation="concat",
```
```diff
@@ -348,7 +356,8 @@ def get_ple_model(schema, args, prediction_tasks):
             schema.select_by_tag(Tags.CATEGORICAL),
             embeddings_regularizer=regularizers.l2(args.embeddings_l2_reg),
             infer_dim_fn=partial(
-                infer_embedding_dim, multiplier=args.embedding_sizes_multiplier,
+                infer_embedding_dim,
+                multiplier=args.embedding_sizes_multiplier,
             ),
         ),
         aggregation="concat",
```