neuralmagic · Satrat · Nov 14, 2023 · Nov 14, 2023 · Nov 14, 2023 · Nov 14, 2023
diff --git a/src/sparseml/modifiers/obcq/utils/sparsegpt.py b/src/sparseml/modifiers/obcq/utils/sparsegpt.py
@@ -199,7 +199,8 @@ def fasterprune(
                 _LOGGER.debug(torch.sum((self.layer(self._inp1) - self.out1) ** 2))
                 _LOGGER.debug(torch.sum(Losses))
 
-        torch.cuda.synchronize()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
         _LOGGER.info("time %.2f" % (time.time() - tick))
         _LOGGER.info("error %.2f" % torch.sum(Losses).item())
 

diff --git a/src/sparseml/transformers/sparsification/obcq/obcq.py b/src/sparseml/transformers/sparsification/obcq/obcq.py
@@ -18,6 +18,7 @@
 from pathlib import Path
 from typing import Optional
 
+import torch
 from torch.nn import Module
 from transformers import AutoConfig
 
@@ -71,6 +72,9 @@ def one_shot(
         if deploy_dir.exists():
             raise RuntimeError(f"deploy_dir={deploy_dir} already exists")
 
+    # fallback to cpu if cuda not available
+    device = _fallback_to_cpu(device)
+
     # Load the configuration from the model path
     config = AutoConfig.from_pretrained(model_path)
     model_type = config.model_type.lower()
@@ -147,6 +151,16 @@ def _save(model, tokenizer, save_path, recipe_path):
         fp.write(load_recipe_yaml_str(recipe_path))
 
 
+def _fallback_to_cpu(device):
+    """
+    Resolve the device to run on, substituting CPU when a CUDA device was
+    requested but CUDA is not available.
+
+    :param device: requested device, e.g. "cuda:0" or "cpu"; any object whose
+        string form names the device (such as a torch.device) is accepted
+    :return: "cpu" if the request names a CUDA device and
+        torch.cuda.is_available() is False, otherwise device unchanged
+    """
+    # str() so the substring check also works for non-str devices such as
+    # torch.device, where `"cuda" in device` would raise a TypeError
+    if "cuda" in str(device) and not torch.cuda.is_available():
+        _LOGGER.warning(
+            f"Requested {device} but CUDA is not available, falling back to CPU"
+        )
+        return "cpu"
+
+    return device
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 

diff --git a/src/sparseml/transformers/sparsification/obcq/utils/helpers.py b/src/sparseml/transformers/sparsification/obcq/utils/helpers.py
@@ -42,6 +42,7 @@ def opt_forward(model: Module, data_loader: List, device: str, nsamples: int = N
         dataloader=data_loader,
         device=device,
         nsamples=nsamples,
+        target_ids=["attention_mask"],
         layer_prefix="decoder",
     )
     buffer = [b[0] for b in cached_inputs.pop("inputs")]
@@ -95,6 +96,7 @@ def llama_forward(model: Module, data_loader: List, device: str, nsamples: int =
         dataloader=data_loader,
         device=device,
         nsamples=nsamples,
+        target_ids=["attention_mask", "position_ids"],
         layer_prefix=None,
     )
     buffer = [b[0] for b in cached_inputs.pop("inputs")]

diff --git a/tests/sparseml/transformers/obcq/test_obcq.py b/tests/sparseml/transformers/obcq/test_obcq.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import pytest
+import torch
 
 from sparseml.modifiers.obcq.utils.helpers import ppl_eval_general
 from sparseml.transformers.data import TransformersDataset
@@ -30,6 +31,8 @@
 def test_obcq_tinystories(recipe_file_path):
     tiny_model_path = "Xenova/llama2.c-stories15M"
     device = "cuda:0"
+    if not torch.cuda.is_available():
+        device = "cpu"
 
     # test recipe with 50% sparsity, quantization and smoothquant
     tiny_model = one_shot(