From e406f427509793eda1d59dfb6767006f153a5dfe Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 23 May 2024 15:36:39 -0700 Subject: [PATCH 01/14] move txt log --- llmfoundry/utils/config_utils.py | 4 ++-- scripts/train/train.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 72ca19834b..6fd4f88bc5 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -39,6 +39,7 @@ 'update_batch_size_info', 'process_init_device', 'log_config', + 'log_dataset_uri' ] @@ -508,7 +509,6 @@ def log_config(cfg: Dict[str, Any]) -> None: if 'mlflow' in loggers and mlflow.active_run(): mlflow.log_params(params=cfg) - _log_dataset_uri(cfg) def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: @@ -619,7 +619,7 @@ def _process_data_source( log.warning('DataSource Not Found.') -def _log_dataset_uri(cfg: Dict[str, Any]) -> None: +def log_dataset_uri(cfg: Dict[str, Any]) -> None: """Logs dataset tracking information to MLflow. Args: diff --git a/scripts/train/train.py b/scripts/train/train.py index e0c2b8a94f..819584aaa1 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -55,6 +55,7 @@ TRAIN_CONFIG_KEYS, TrainConfig, log_config, + log_log_dataset_uri, make_dataclass_and_log_config, pop_config, process_init_device, @@ -530,6 +531,7 @@ def main(cfg: DictConfig) -> Trainer: if train_cfg.log_config: log.info('Logging config') log_config(logged_cfg) + log_dataset_uri(train_cfg) torch.cuda.empty_cache() gc.collect() From f298c7b5cdff96fe0cb51cf0e6890590d95445b7 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 23 May 2024 15:52:36 -0700 Subject: [PATCH 02/14] typo --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 819584aaa1..f06d39acd3 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -55,7 +55,7 @@ TRAIN_CONFIG_KEYS, TrainConfig, log_config, - log_log_dataset_uri, + log_dataset_uri, make_dataclass_and_log_config, pop_config, process_init_device, From 3d0e578b6cf0813cb8784edfdb3df7bcdaf4d98f Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 24 May 2024 03:29:22 -0400 Subject: [PATCH 03/14] Update scripts/train/train.py --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index f06d39acd3..bfeec14e0b 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -531,7 +531,7 @@ def main(cfg: DictConfig) -> Trainer: if train_cfg.log_config: log.info('Logging config') log_config(logged_cfg) - log_dataset_uri(train_cfg) + log_dataset_uri(logged_cfg) torch.cuda.empty_cache() gc.collect() From 0dac0c56601a373e43df8674d4d1edbf03f52d07 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 11:09:56 -0700 Subject: [PATCH 04/14] train config --- llmfoundry/utils/config_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 6fd4f88bc5..289fdfd5bf 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -511,7 +511,7 @@ def log_config(cfg: Dict[str, Any]) -> None: mlflow.log_params(params=cfg) -def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: +def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: """Parse a run config for dataset information. Given a config dictionary, parse through it to determine what the datasource @@ -527,9 +527,9 @@ def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: data_paths = [] # Handle train loader if it exists - train_dataset: Dict = cfg.get('train_loader', {}).get('dataset', {}) + train_dataset: Dict = cfg.train_loader.get('dataset', {}) train_split = train_dataset.get('split', None) - train_source_path = cfg.get('source_dataset_train', None) + train_source_path = cfg.source_dataset_train _process_data_source( train_source_path, train_dataset, @@ -539,7 +539,7 @@ def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: ) # Handle eval_loader which might be a list or a single dictionary - eval_data_loaders = cfg.get('eval_loader', {}) + eval_data_loaders = cfg.eval_loader if not isinstance(eval_data_loaders, list): eval_data_loaders = [ eval_data_loaders, @@ -549,7 +549,7 @@ def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: assert isinstance(eval_data_loader, dict) # pyright type check eval_dataset: Dict = eval_data_loader.get('dataset', {}) eval_split = eval_dataset.get('split', None) - eval_source_path = cfg.get('source_dataset_eval', None) + eval_source_path = cfg.source_dataset_eval _process_data_source( eval_source_path, eval_dataset, From ab979720b4922a031247630286190d24ddb46fbc Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 11:20:03 -0700 Subject: [PATCH 05/14] debug --- llmfoundry/utils/config_utils.py | 2 +- scripts/train/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 289fdfd5bf..472b84fd9c 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -527,7 +527,7 @@ def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: data_paths = [] # Handle train loader if it exists - train_dataset: Dict = cfg.train_loader.get('dataset', {}) + train_dataset: TrainConfig = cfg.train_loader.get('dataset', {}) train_split = train_dataset.get('split', None) train_source_path = cfg.source_dataset_train _process_data_source( diff --git a/scripts/train/train.py b/scripts/train/train.py index bfeec14e0b..f06d39acd3 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -531,7 +531,7 @@ def main(cfg: DictConfig) -> Trainer: if train_cfg.log_config: log.info('Logging config') log_config(logged_cfg) - log_dataset_uri(logged_cfg) + log_dataset_uri(train_cfg) torch.cuda.empty_cache() gc.collect() From 87f873b786171e292e1bbefaed361672426e839c Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 11:41:05 -0700 Subject: [PATCH 06/14] source data --- llmfoundry/utils/config_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 472b84fd9c..7000e04d61 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -177,6 +177,10 @@ class TrainConfig: # Profiling profiler: Optional[Dict[str, Any]] = None + # MLFlow datasets + source_dataset_train: Optional[Dict[str, Any]] = None + source_dataset_eval: Optional[Dict[str, Any]] = None + # Variables to ignore variables: Optional[Dict[str, Any]] = None From 446a12ee2f3d123c52ff61eb38d78594b1aa71bf Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 11:41:54 -0700 Subject: [PATCH 07/14] verbose --- llmfoundry/utils/config_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 7000e04d61..adc90bbed3 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -534,6 +534,7 @@ def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: train_dataset: TrainConfig = cfg.train_loader.get('dataset', {}) train_split = train_dataset.get('split', None) train_source_path = cfg.source_dataset_train + print(f'---- VERBOSE {train_source_path}') _process_data_source( train_source_path, train_dataset, From b639f28a7ef226cc49835f094bb052f38783b8f4 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 11:55:16 -0700 Subject: [PATCH 08/14] debug --- llmfoundry/utils/config_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index adc90bbed3..f74b31f015 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -551,6 +551,7 @@ def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: ] # Normalize to list if it's a single dictionary for eval_data_loader in eval_data_loaders: + print(f'---- DEBUG {eval_data_loader}, {type(eval_data_loader)}') assert isinstance(eval_data_loader, dict) # pyright type check eval_dataset: Dict = eval_data_loader.get('dataset', {}) eval_split = eval_dataset.get('split', None) From 50149de13315dced51999e9265bd6a9386602428 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 12:03:10 -0700 Subject: [PATCH 09/14] debug --- llmfoundry/utils/config_utils.py | 39 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index f74b31f015..ce4db5e7ec 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -534,7 +534,7 @@ def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: train_dataset: TrainConfig = cfg.train_loader.get('dataset', {}) train_split = train_dataset.get('split', None) train_source_path = cfg.source_dataset_train - print(f'---- VERBOSE {train_source_path}') + print(f'---- VERBOSE {cfg}') _process_data_source( train_source_path, train_dataset, @@ -545,24 +545,25 @@ def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: # Handle eval_loader which might be a list or a single dictionary eval_data_loaders = cfg.eval_loader - if not isinstance(eval_data_loaders, list): - eval_data_loaders = [ - eval_data_loaders, - ] # Normalize to list if it's a single dictionary - - for eval_data_loader in eval_data_loaders: - print(f'---- DEBUG {eval_data_loader}, {type(eval_data_loader)}') - assert isinstance(eval_data_loader, dict) # pyright type check - eval_dataset: Dict = eval_data_loader.get('dataset', {}) - eval_split = eval_dataset.get('split', None) - eval_source_path = cfg.source_dataset_eval - _process_data_source( - eval_source_path, - eval_dataset, - eval_split, - 'eval', - data_paths, - ) + if eval_data_loaders: + if not isinstance(eval_data_loaders, list): + eval_data_loaders = [ + eval_data_loaders, + ] # Normalize to list if it's a single dictionary + + for eval_data_loader in eval_data_loaders: + print(f'---- DEBUG {eval_data_loader}, {type(eval_data_loader)}') + assert isinstance(eval_data_loader, dict) # pyright type check + eval_dataset: Dict = eval_data_loader.get('dataset', {}) + eval_split = eval_dataset.get('split', None) + eval_source_path = cfg.source_dataset_eval + _process_data_source( + eval_source_path, + eval_dataset, + eval_split, + 'eval', + data_paths, + ) return data_paths From e525791ead474228d836501958f66be09e2ee7a2 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 12:45:06 -0700 Subject: [PATCH 10/14] check if mlflow is active --- llmfoundry/utils/config_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index ce4db5e7ec..91becaf17f 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -534,7 +534,6 @@ def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: train_dataset: TrainConfig = cfg.train_loader.get('dataset', {}) train_split = train_dataset.get('split', None) train_source_path = cfg.source_dataset_train - print(f'---- VERBOSE {cfg}') _process_data_source( train_source_path, train_dataset, @@ -632,6 +631,9 @@ def log_dataset_uri(cfg: Dict[str, Any]) -> None: Args: cfg (DictConfig): A config dictionary of a run """ + loggers = cfg.get('loggers', None) or {} + if 'mlflow' not in loggers or not mlflow.active_run(): + return # Figure out which data source to use data_paths = _parse_source_dataset(cfg) From 616baf69e3ddea1b96fb589a6c3e62ee8da48b93 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 12:54:36 -0700 Subject: [PATCH 11/14] fex tests --- llmfoundry/utils/config_utils.py | 2 +- tests/utils/test_mlflow_logging.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 91becaf17f..c5351a9e11 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -39,7 +39,7 @@ 'update_batch_size_info', 'process_init_device', 'log_config', - 'log_dataset_uri' + 'log_dataset_uri', ] diff --git a/tests/utils/test_mlflow_logging.py b/tests/utils/test_mlflow_logging.py index 04a600d44c..120725a84a 100644 --- a/tests/utils/test_mlflow_logging.py +++ b/tests/utils/test_mlflow_logging.py @@ -7,7 +7,7 @@ import pytest from llmfoundry.utils.config_utils import ( - _log_dataset_uri, + log_dataset_uri, _parse_source_dataset, ) @@ -87,7 +87,7 @@ def test_log_dataset_uri(): ) with patch('mlflow.log_input') as mock_log_input: - _log_dataset_uri(cfg) + log_dataset_uri(cfg) assert mock_log_input.call_count == 2 meta_dataset_calls = [ args[0] for args, _ in mock_log_input.call_args_list From bf7068a7d0a237ce62f4f7d28a9b2923c86b926c Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 13:31:47 -0700 Subject: [PATCH 12/14] move mlflow check to train --- llmfoundry/utils/config_utils.py | 53 +++++++++++++------------------- scripts/train/train.py | 4 ++- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index c5351a9e11..6fd4f88bc5 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -39,7 +39,7 @@ 'update_batch_size_info', 'process_init_device', 'log_config', - 'log_dataset_uri', + 'log_dataset_uri' ] @@ -177,10 +177,6 @@ class TrainConfig: # Profiling profiler: Optional[Dict[str, Any]] = None - # MLFlow datasets - source_dataset_train: Optional[Dict[str, Any]] = None - source_dataset_eval: Optional[Dict[str, Any]] = None - # Variables to ignore variables: Optional[Dict[str, Any]] = None @@ -515,7 +511,7 @@ def log_config(cfg: Dict[str, Any]) -> None: mlflow.log_params(params=cfg) -def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: +def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: """Parse a run config for dataset information. Given a config dictionary, parse through it to determine what the datasource @@ -531,9 +527,9 @@ def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: data_paths = [] # Handle train loader if it exists - train_dataset: TrainConfig = cfg.train_loader.get('dataset', {}) + train_dataset: Dict = cfg.get('train_loader', {}).get('dataset', {}) train_split = train_dataset.get('split', None) - train_source_path = cfg.source_dataset_train + train_source_path = cfg.get('source_dataset_train', None) _process_data_source( train_source_path, train_dataset, @@ -543,26 +539,24 @@ def _parse_source_dataset(cfg: TrainConfig) -> List[Tuple[str, str, str]]: ) # Handle eval_loader which might be a list or a single dictionary - eval_data_loaders = cfg.eval_loader - if eval_data_loaders: - if not isinstance(eval_data_loaders, list): - eval_data_loaders = [ - eval_data_loaders, - ] # Normalize to list if it's a single dictionary - - for eval_data_loader in eval_data_loaders: - print(f'---- DEBUG {eval_data_loader}, {type(eval_data_loader)}') - assert isinstance(eval_data_loader, dict) # pyright type check - eval_dataset: Dict = eval_data_loader.get('dataset', {}) - eval_split = eval_dataset.get('split', None) - eval_source_path = cfg.source_dataset_eval - _process_data_source( - eval_source_path, - eval_dataset, - eval_split, - 'eval', - data_paths, - ) + eval_data_loaders = cfg.get('eval_loader', {}) + if not isinstance(eval_data_loaders, list): + eval_data_loaders = [ + eval_data_loaders, + ] # Normalize to list if it's a single dictionary + + for eval_data_loader in eval_data_loaders: + assert isinstance(eval_data_loader, dict) # pyright type check + eval_dataset: Dict = eval_data_loader.get('dataset', {}) + eval_split = eval_dataset.get('split', None) + eval_source_path = cfg.get('source_dataset_eval', None) + _process_data_source( + eval_source_path, + eval_dataset, + eval_split, + 'eval', + data_paths, + ) return data_paths @@ -631,9 +625,6 @@ def log_dataset_uri(cfg: Dict[str, Any]) -> None: Args: cfg (DictConfig): A config dictionary of a run """ - loggers = cfg.get('loggers', None) or {} - if 'mlflow' not in loggers or not mlflow.active_run(): - return # Figure out which data source to use data_paths = _parse_source_dataset(cfg) diff --git a/scripts/train/train.py b/scripts/train/train.py index f06d39acd3..7e540b7896 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -531,7 +531,9 @@ def main(cfg: DictConfig) -> Trainer: if train_cfg.log_config: log.info('Logging config') log_config(logged_cfg) - log_dataset_uri(train_cfg) + loggers = logged_cfg.get('loggers', None) or {} + if 'mlflow' in loggers and mlflow.active_run(): + log_dataset_uri(logged_cfg) torch.cuda.empty_cache() gc.collect() From f15fc51e2b3c9d8142a9f19c5bd6be82f3c90a23 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 13:43:37 -0700 Subject: [PATCH 13/14] update test --- llmfoundry/utils/config_utils.py | 3 +++ scripts/train/train.py | 4 +--- tests/utils/test_mlflow_logging.py | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 6fd4f88bc5..7e8e9b86c5 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -625,6 +625,9 @@ def log_dataset_uri(cfg: Dict[str, Any]) -> None: Args: cfg (DictConfig): A config dictionary of a run """ + loggers = cfg.get('loggers', None) or {} + if 'mlflow' not in loggers or not mlflow.active_run(): + return # Figure out which data source to use data_paths = _parse_source_dataset(cfg) diff --git a/scripts/train/train.py b/scripts/train/train.py index 7e540b7896..bfeec14e0b 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -531,9 +531,7 @@ def main(cfg: DictConfig) -> Trainer: if train_cfg.log_config: log.info('Logging config') log_config(logged_cfg) - loggers = logged_cfg.get('loggers', None) or {} - if 'mlflow' in loggers and mlflow.active_run(): - log_dataset_uri(logged_cfg) + log_dataset_uri(logged_cfg) torch.cuda.empty_cache() gc.collect() diff --git a/tests/utils/test_mlflow_logging.py b/tests/utils/test_mlflow_logging.py index 120725a84a..b55645b64a 100644 --- a/tests/utils/test_mlflow_logging.py +++ b/tests/utils/test_mlflow_logging.py @@ -84,9 +84,11 @@ def test_log_dataset_uri(): }}, source_dataset_train='huggingface/train_dataset', source_dataset_eval='huggingface/eval_dataset', + loggers={'mlflow': {}} ) - with patch('mlflow.log_input') as mock_log_input: + with patch('mlflow.log_input') as mock_log_input, \ + patch('mlflow.active_run', return_value=True): log_dataset_uri(cfg) assert mock_log_input.call_count == 2 meta_dataset_calls = [ From bed0976916d69bba63621a0d4b2a3db348c9362a Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 24 May 2024 13:52:20 -0700 Subject: [PATCH 14/14] precommit --- llmfoundry/utils/config_utils.py | 2 +- tests/utils/test_mlflow_logging.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 7e8e9b86c5..b6a5acf6d9 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -39,7 +39,7 @@ 'update_batch_size_info', 'process_init_device', 'log_config', - 'log_dataset_uri' + 'log_dataset_uri', ] diff --git a/tests/utils/test_mlflow_logging.py b/tests/utils/test_mlflow_logging.py index b55645b64a..d2dd5f4689 100644 --- a/tests/utils/test_mlflow_logging.py +++ b/tests/utils/test_mlflow_logging.py @@ -7,8 +7,8 @@ import pytest from llmfoundry.utils.config_utils import ( - log_dataset_uri, _parse_source_dataset, + log_dataset_uri, ) mlflow = pytest.importorskip('mlflow') @@ -84,7 +84,7 @@ def test_log_dataset_uri(): }}, source_dataset_train='huggingface/train_dataset', source_dataset_eval='huggingface/eval_dataset', - loggers={'mlflow': {}} + loggers={'mlflow': {}}, ) with patch('mlflow.log_input') as mock_log_input, \