mosaicml · snarayan21 · Jun 17, 2024 · Jun 11, 2024 · Jun 12, 2024 · Jun 14, 2024
@@ -22,7 +22,7 @@ jobs:
           markers: not daily and not remote and not gpu and not doctest
           pytest_command: coverage run -m pytest
         - name: cpu-3.11-2.3
-          container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+          container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
           markers: not daily and not remote and not gpu and not doctest
           pytest_command: coverage run -m pytest
         - name: cpu-doctest

@@ -83,7 +83,21 @@ def __init__(self, dist_sync_on_step: bool = False, ignore_index: int = -100):
         super().__init__(dist_sync_on_step=dist_sync_on_step)
 
         self.ignore_index = ignore_index
-        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum')
+        self.flash_loss_fn = None
+        try:
+            from flash_attn.losses.cross_entropy import CrossEntropyLoss as FusedCrossEntropyLoss
+            log.debug(
+                'Found `flash_attn` installation. Using CrossEntropyLoss from `flash_attn`' +
+                'to compute LanguageCrossEntropy metric for CUDA tensors, which will be faster.',
+            )
+            self.flash_loss_fn = FusedCrossEntropyLoss(ignore_index=ignore_index, reduction='sum')
+        except ImportError:
+            if torch.cuda.is_available():
+                log.debug(
+                    'Package `flash_attn` not installed. Using torch.nn.CrossEntropyLoss ' +
+                    'to compute LanguageCrossEntropy metric for CUDA tensors, which will be slower.',
+                )
+        self.torch_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum')
         self.add_state('sum_loss', default=torch.tensor(0.), dist_reduce_fx='sum')
         self.add_state('total_items', default=torch.tensor(0), dist_reduce_fx='sum')
 
@@ -104,7 +118,11 @@ def update(self, output: Union[Mapping, Tensor], target: Tensor) -> None:
 
         target = target.view(-1)
         logits = logits.view(target.shape[0], -1)
-        losses = self.loss_fn(logits, target)
+        # Use flash_attn's CE loss only when it was imported and both inputs are CUDA tensors; otherwise use torch's.
+        if self.flash_loss_fn is not None and target.is_cuda and logits.is_cuda:
+            losses = self.flash_loss_fn(logits, target)
+        else:
+            losses = self.torch_loss_fn(logits, target)
 
         total_items = (target != self.ignore_index).sum()
         self.total_items += total_items  #type: ignore (third-party)