From 7aa3b7abbeec469413f35299c0228a428c201e79 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 23 Jan 2024 23:38:10 -0500 Subject: [PATCH] report min length of tokenized data --- src/axolotl/utils/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 38b67fb43..26197b991 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -110,6 +110,8 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset): drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len) with zero_first(is_main_process()): if cfg.is_preprocess: + min_input_len = np.min(get_dataset_lengths(train_dataset)) + LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True) max_input_len = np.max(get_dataset_lengths(train_dataset)) LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)