Add support for auto packing ratio #683

Merged 26 commits on Nov 5, 2023
Changes from 15 commits
Commits (26):
6d53fca  Add support for auto packing ratio (irenedea, Sep 22, 2023)
1c0f157  Add test (irenedea, Oct 17, 2023)
b32bac0  Refactor and change to generator (irenedea, Oct 19, 2023)
d9dcdbc  Add simple tests (irenedea, Oct 19, 2023)
93d7926  Add auto packing tests (irenedea, Oct 19, 2023)
ec71fec  Add auto packing to test_dataloader (irenedea, Oct 19, 2023)
239fce4  Merge branch 'main' into packing-collator (irenedea, Oct 19, 2023)
0db972a  use correct max leftovers to keep (irenedea, Oct 19, 2023)
6c321d3  Handle dataspec change (irenedea, Oct 20, 2023)
d6793bb  Merge branch 'main' into packing-collator (irenedea, Oct 20, 2023)
a852c23  Add dataloader test (irenedea, Oct 20, 2023)
d48fb97  Add distributed autopacking (irenedea, Oct 21, 2023)
8c08405  Update comments for profile_packing script refactor (irenedea, Oct 21, 2023)
6aab1ad  add torch cuda check (irenedea, Oct 21, 2023)
aeffb4b  Use 0 workers for profiling because one batch is loaded per worker an… (irenedea, Oct 21, 2023)
044bb00  Merge branch 'main' into packing-collator (irenedea, Oct 21, 2023)
96b4829  Fix code quality (irenedea, Oct 24, 2023)
2ad0c31  Merge branch 'main' into packing-collator (irenedea, Oct 24, 2023)
83e8d3a  Merge branch 'main' into packing-collator (irenedea, Oct 25, 2023)
f8ba32f  Merge branch 'main' into packing-collator (dakinggg, Oct 28, 2023)
1ee68b8  Merge branch 'main' into packing-collator (irenedea, Nov 2, 2023)
d88cdcc  Address PR comments (irenedea, Nov 2, 2023)
913c47f  Merge branch 'main' into packing-collator (irenedea, Nov 2, 2023)
57cb170  Set random seed for auto packing to make it deterministic (irenedea, Nov 3, 2023)
de6b45d  Fix typo (irenedea, Nov 3, 2023)
2ff88c2  Update max_leftover_bins_to_keep to keep all and remove unused variables (irenedea, Nov 4, 2023)
3 changes: 3 additions & 0 deletions llmfoundry/data/__init__.py
@@ -9,6 +9,8 @@
from llmfoundry.data.text_data import (StreamingTextDataset,
                                       build_text_dataloader)

from llmfoundry.data.dataloader import build_dataloader

__all__ = [
    'MixtureOfDenoisersCollator',
    'build_text_denoising_dataloader',
@@ -18,4 +20,5 @@
    'build_text_dataloader',
    'NoConcatDataset',
    'ConcatTokensDataset',
    'build_dataloader',
]
46 changes: 46 additions & 0 deletions llmfoundry/data/dataloader.py
@@ -0,0 +1,46 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

"""Dataloader builder utilities."""

from composer import DataSpec
from omegaconf import DictConfig
from transformers import PreTrainedTokenizerBase

from llmfoundry.data.text_data import build_text_dataloader

from llmfoundry.data.denoising import build_text_denoising_dataloader

from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader


def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
                     device_batch_size: int) -> DataSpec:
    """Builds a dataloader from a config.

    Args:
        cfg (DictConfig): An omegaconf dictionary used to configure the loader.
        tokenizer (PreTrainedTokenizerBase): The tokenizer that the model will use.
        device_batch_size (int): The size of the batches (number of examples)
            that the dataloader will produce.
    """
    if cfg.name == 'text':
        return build_text_dataloader(
            cfg,
            tokenizer,
            device_batch_size,
        )
    elif cfg.name == 'text_denoising':
        return build_text_denoising_dataloader(
            cfg,
            tokenizer,
            device_batch_size,
        )
    elif cfg.name == 'finetuning':
        return build_finetuning_dataloader(
            cfg,
            tokenizer,
            device_batch_size,
        )
    else:
        raise ValueError(f'Not sure how to build dataloader with config: {cfg}')
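For orientation, here is a minimal sketch of how the new `build_dataloader` entry point might be called once it is exported from `llmfoundry.data`. The `name` field selects the branch in the dispatch above; the dataset keys shown are illustrative assumptions rather than the exact schema expected by `build_text_dataloader`.

```python
# A minimal sketch (not part of the PR) of invoking the unified builder.
# The dataset fields below are illustrative assumptions; check the text
# dataloader documentation for the exact keys it requires.
from omegaconf import OmegaConf
from transformers import AutoTokenizer

from llmfoundry.data import build_dataloader

cfg = OmegaConf.create({
    'name': 'text',  # dispatches to build_text_dataloader
    'dataset': {
        'local': '/tmp/my-copy-c4',  # hypothetical local streaming dataset
        'split': 'train_small',
        'max_seq_len': 2048,
        'shuffle': True,
    },
    'drop_last': True,
    'num_workers': 8,
})

tokenizer = AutoTokenizer.from_pretrained('gpt2')
data_spec = build_dataloader(cfg, tokenizer, device_batch_size=8)
```

Since every branch returns a composer `DataSpec`, downstream training code can treat the three dataloader families uniformly.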
8 changes: 4 additions & 4 deletions llmfoundry/data/denoising.py
@@ -16,7 +16,7 @@
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerBase

from llmfoundry.data.packing import BinPackWrapper
from llmfoundry.data.packing import BinPackCollator
from llmfoundry.data.text_data import (StreamingTextDataset,
get_tokens_per_batch_func)
from llmfoundry.models import utils
@@ -387,7 +387,7 @@ def build_text_denoising_dataloader(
packing.
Select packing_ratio **carefully** based on the dataset
statistics, max_seq_len, and tolerance for discarding samples!
The packing code in `./packing.py` provides a script that can help
The script `scripts/misc/profile_packing.py` can help
you choose the best packing_ratio.
See :class:`StreamingTextDataset` for info on other standard config
options within `cfg.dataset`.
@@ -419,7 +419,7 @@ def build_text_denoising_dataloader(
that the dataloader will produce.

Note:
You can run the script inside `./packing.py` to quickly test the
You can use the script `scripts/misc/profile_packing.py` to quickly test the
padding/waste rates for different `cfg.dataset.packing_ratio` choices,
given a starting workload YAML.
"""
@@ -492,7 +492,7 @@ def build_text_denoising_dataloader(
raise NotImplementedError(
'On-the-fly packing is currently only supported for decoder-only formats.'
)
collate_fn = BinPackWrapper(
collate_fn = BinPackCollator(
collator=collate_fn,
target_batch_size=device_batch_size,
max_seq_len=cfg.dataset.max_seq_len,
21 changes: 13 additions & 8 deletions llmfoundry/data/finetuning/dataloader.py
@@ -14,7 +14,7 @@

from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator
from llmfoundry.data.finetuning.tasks import dataset_constructor
from llmfoundry.data.packing import BinPackWrapper
from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio
from llmfoundry.data.text_data import get_tokens_per_batch_func

log = logging.getLogger(__name__)
@@ -86,7 +86,7 @@ def build_finetuning_dataloader(cfg: DictConfig,
packing.
Select `packing_ratio` **carefully** based on the dataset
statistics, `max_seq_len`, and tolerance for discarding samples!
The packing code in `../packing.py` provides a script that can help
`scripts/misc/profile_packing.py` is a script that can help
you choose the best `packing_ratio`.
cfg.dataset.shuffle (bool): Whether to shuffle the dataset.
___
@@ -106,7 +106,7 @@ def build_finetuning_dataloader(cfg: DictConfig,
A pytorch dataloader

Note:
You can run the script inside `../packing.py` to quickly test the
You can run the script `scripts/misc/profile_packing.py` to quickly test the
padding/waste rates for different `cfg.dataset.packing_ratio` choices,
given a starting workload YAML.
"""
Expand Down Expand Up @@ -143,7 +143,7 @@ def build_finetuning_dataloader(cfg: DictConfig,
)

collate_fn, dataloader_batch_size = _build_collate_fn(
cfg.dataset, tokenizer, device_batch_size)
cfg, tokenizer, device_batch_size)

dl = DataLoader(
dataset,
@@ -174,7 +174,7 @@ def build_finetuning_dataloader(cfg: DictConfig,
)

collate_fn, dataloader_batch_size = _build_collate_fn(
cfg.dataset, tokenizer, device_batch_size)
cfg, tokenizer, device_batch_size)

if cfg.drop_last:
world_size = dist.get_world_size()
@@ -367,9 +367,10 @@ def _build_hf_dataset_from_remote(


def _build_collate_fn(
dataset_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase,
device_batch_size: int
) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackWrapper], int]:
) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]:
dataset_cfg = dataloader_cfg.dataset
collate_fn = Seq2SeqFinetuningCollator(
tokenizer=tokenizer,
max_seq_len=dataset_cfg.max_seq_len,
@@ -386,6 +387,10 @@ def _build_collate_fn(
'the latter to turn on packing or remove the former from the config.')
return collate_fn, device_batch_size

if packing_ratio == 'auto':
packing_ratio = auto_packing_ratio(dataloader_cfg, tokenizer,
device_batch_size)

if packing_ratio == 1.0:
return collate_fn, device_batch_size
elif packing_ratio < 1.0:
@@ -396,7 +401,7 @@ def _build_collate_fn(
'On-the-fly packing is currently only supported for decoder-only formats.'
)

collate_fn = BinPackWrapper(
collate_fn = BinPackCollator(
collator=collate_fn,
target_batch_size=device_batch_size,
max_seq_len=dataset_cfg.max_seq_len,
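To show where the new `'auto'` option plugs in, here is a hedged sketch of a finetuning dataloader config that requests runtime profiling of the packing ratio. Only `packing_ratio: 'auto'` reflects behavior added in this PR; the other dataset fields are assumptions for illustration.

```python
# Sketch of a finetuning dataloader config that opts into auto packing.
# Only the 'packing_ratio': 'auto' entry exercises the new code path; the
# remaining fields are illustrative assumptions.
from omegaconf import OmegaConf

train_loader_cfg = OmegaConf.create({
    'name': 'finetuning',
    'dataset': {
        'hf_name': 'mosaicml/dolly_hhrlhf',  # hypothetical dataset choice
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,  # packing is decoder-only
        'allow_pad_trimming': False,
        'packing_ratio': 'auto',  # resolved via auto_packing_ratio(...)
        'shuffle': True,
    },
    'drop_last': True,
    'num_workers': 8,
})
```

With this setting, `_build_collate_fn` calls `auto_packing_ratio(dataloader_cfg, tokenizer, device_batch_size)` to pick a ratio before wrapping the collator in a `BinPackCollator`, so the value no longer has to be hand-tuned with `scripts/misc/profile_packing.py`.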