Merge branch 'main' into wanda
rahul-tuli committed Nov 27, 2023
2 parents 35ab428 + 0946ca8 commit c3e78a0
Showing 20 changed files with 2,436 additions and 38 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/test-check.yaml
@@ -197,3 +197,27 @@ jobs:
run: pip3 install .[dev,torchvision,onnxruntime] torch==1.9.1
- name: "🔬 Running onnx tests"
run: make test TARGETS=onnx
transformers-tests:
runs-on: ubuntu-22.04
env:
SPARSEZOO_TEST_MODE: "true"
needs: test-setup
if: ${{needs.test-setup.outputs.pytorch == 1}}
steps:
- uses: actions/setup-python@v4
with:
python-version: '3.11'
- uses: actions/checkout@v2
- uses: actions/checkout@v2
with:
repository: "neuralmagic/sparsezoo"
path: "sparsezoo"
ref: ${{needs.test-setup.outputs.branch}}
- name: "⚙️ Install sparsezoo dependencies"
run: pip3 install -U pip && pip3 install setuptools sparsezoo/
- name: "Clean sparsezoo directory"
run: rm -r sparsezoo/
- name: "⚙️ Install dependencies"
run: pip3 install .[dev,torch,transformers]
- name: "🔬 Running transformers tests"
run: make test TARGETS=transformers
27 changes: 15 additions & 12 deletions setup.py
@@ -63,17 +63,17 @@

_onnxruntime_deps = ["onnxruntime>=1.0.0"]
_clip_deps = ["open_clip_torch==2.20.0"]
supported_torch_version = "torch>=1.7.0,<=2.0"
supported_torch_version = "torch>=1.7.0,<2.2"
_pytorch_deps = [
supported_torch_version,
"gputils",
]
_pytorch_all_deps = _pytorch_deps + [
"torchvision>=0.3.0,<=0.15.1",
"torchvision>=0.3.0,<0.17",
"torchaudio<=2.0.1",
]
_pytorch_vision_deps = _pytorch_deps + [
"torchvision>=0.3.0,<=0.15.1",
"torchvision>=0.3.0,<0.17",
"opencv-python<=4.6.0.66",
]
_transformers_deps = _pytorch_deps + [
@@ -103,24 +103,26 @@
"black==22.12.0",
"flake8==3.9.2",
"isort==5.8.0",
"m2r2~=0.2.7",
"wheel>=0.36.2",
"pytest>=6.0.0",
"pytest-mock>=3.6.0",
"flaky~=3.7.0",
"tensorboard>=1.0,<2.9",
"tensorboardX>=1.0",
]

_docs_deps = [
"m2r2>=0.2.7",
"mistune<3,>=2.0.3",
"myst-parser~=0.14.0",
"myst-parser>=0.14.0",
"rinohtype~=0.4.2",
"sphinx~=3.5.0",
"sphinx-copybutton~=0.3.0",
"sphinx-markdown-tables~=0.0.15",
"sphinx-multiversion~=0.2.4",
"sphinx-pydantic~=0.1.0",
"sphinx-rtd-theme~=0.5.0",
"wheel>=0.36.2",
"pytest~=6.2.0",
"pytest-mock~=3.6.0",
"flaky~=3.7.0",
"sphinx-rtd-theme",
"docutils<0.17",
"tensorboard>=1.0,<2.9",
"tensorboardX>=1.0",
]


@@ -148,6 +150,7 @@ def _setup_extras() -> Dict:
return {
"clip": _clip_deps,
"dev": _dev_deps,
"docs": _docs_deps,
"deepsparse": _deepsparse_deps,
"deepsparse-ent": _deepsparse_ent_deps,
"openpifpaf": _open_pif_paf_deps,
3 changes: 2 additions & 1 deletion src/sparseml/exporters/transforms/flatten_qparams.py
@@ -50,7 +50,8 @@ def transform(self, model: ModelProto) -> ModelProto:
continue
self.log_match(init)
a = numpy_helper.to_array(init)
assert a.shape == (1,)
if a.shape != (1,):
continue # assume qparam is already flattened
b = numpy.array(a[0])
assert b.shape == ()
assert b.dtype == a.dtype
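For context, the relaxed check above amounts to something like the following standalone sketch: shape-(1,) quantization parameters are still flattened to scalars, while anything else (e.g. a qparam already flattened to shape ()) is now skipped instead of failing the old assertion. The helper name and the use of numpy_helper.from_array are illustrative only, not the transform's actual code path.

import numpy
from onnx import TensorProto, numpy_helper


def flatten_qparam(init: TensorProto) -> TensorProto:
    # hypothetical standalone helper mirroring the relaxed transform logic
    a = numpy_helper.to_array(init)
    if a.shape != (1,):
        # e.g. already flattened to shape () by an earlier pass -> leave as-is
        return init
    b = numpy.array(a[0])  # scalar with the same dtype as the original qparam
    assert b.shape == () and b.dtype == a.dtype
    return numpy_helper.from_array(b, name=init.name)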
2 changes: 1 addition & 1 deletion src/sparseml/pytorch/base.py
@@ -49,7 +49,7 @@


_TORCH_MIN_VERSION = "1.0.0"
_TORCH_MAX_VERSION = os.environ.get("MAX_TORCH", "2.0.100")
_TORCH_MAX_VERSION = os.environ.get("MAX_TORCH", "2.1.10")


def check_torch_install(
3 changes: 3 additions & 0 deletions src/sparseml/pytorch/sparsification/pruning/__init__.py
@@ -22,6 +22,7 @@
from .mask_creator import *
from .mask_params import *
from .modifier_as import *
from .modifier_powerpropagation import *
from .modifier_pruning_acdc import *
from .modifier_pruning_base import *
from .modifier_pruning_constant import *
@@ -30,5 +31,7 @@
from .modifier_pruning_mfac import *
from .modifier_pruning_movement import *
from .modifier_pruning_obs import *
from .modifier_pruning_rigl import *
from .modifier_pruning_structured import *
from .modifier_pruning_topkast import *
from .scorer import *
64 changes: 46 additions & 18 deletions src/sparseml/pytorch/sparsification/pruning/mask_params.py
@@ -59,6 +59,8 @@ class ModuleParamPruningMask(object):
sparsity ranking values within each individual tensor. Default is False
:param allow_reintroduction: set True to not mask weights and gradients between
forward passes (forward mask hooks will remain). Default is False
:param mask_gradients_only: Apply the mask to the gradients only and not to
the weights (at any point). Default is False.
"""

def __init__(
Expand All @@ -69,6 +71,7 @@ def __init__(
param_names: Union[str, List[str]] = "weight",
store_init: bool = False,
store_unmasked: bool = False,
mask_gradients_only: bool = False,
track_grad_mom: float = -1.0,
layer_names: Optional[List[str]] = None,
global_sparsity: bool = False,
@@ -86,6 +89,7 @@ def __init__(
self._layer_names = layer_names
self._store_init = store_init
self._store_unmasked = store_unmasked
self._mask_gradients_only = mask_gradients_only
self._track_grad_mom = track_grad_mom
self._global_sparsity = global_sparsity

@@ -114,6 +118,7 @@ def __init__(
self._params_grad = [None] * len(self._layers) # type: List[Tensor]
self._params_movement = [None] * len(self._layers) # type: List[Tensor]
self._params_applied_thinning = [0.0] * len(self._layers) # type: List[float]
self._mask_applied = [False] * len(self._layers) # type: List[bool]

# movement pruning requires weight reintroduction
self._allow_reintroduction = allow_reintroduction
@@ -299,7 +304,7 @@ def set_param_data(self, value: Tensor, param_idx: int):
self._params_unmasked[param_idx] = None
self._setup_params_unmasked(param_idx)

if not self._allow_reintroduction:
if not self._allow_reintroduction and not self._mask_gradients_only:
self.apply(param_idx)

def set_param_masks(self, masks: List[Tensor]):
@@ -330,7 +335,7 @@ def set_param_masks(self, masks: List[Tensor]):
if self._scorer:
self._scorer.mask_update(masks, mask_diffs)

if not self._allow_reintroduction:
if not self._allow_reintroduction and not self._mask_gradients_only:
self.apply()

return mask_diffs
@@ -395,11 +400,27 @@ def apply(self, param_idx: Optional[int] = None):
self._check_regen_param_vals(idx)

with torch.no_grad():
# In the case of forward-pass-only masks (Top-KAST, Movement
# pruning), the mask is applied on the forward pass and
# reverted on the backward pass. At the same time, every time the
# mask is applied, we store the previous values in
# _params_unmasked. So long as we alternate forward and backward
# passes (i.e., during training), this works fine. However, if
# we only do forward passes (i.e., during testing/validation),
# we can override the unmasked parameters with sparse ones. To
# prevent this, only update the unmasked params cache when the
# mask is applied for the first time since it was removed.
#
# Note that there is an assumption here that the weights do not
# change when the mask is applied (which is satisfied during
# training, since the mask is removed on every backward pass).
if self._store_unmasked:
self._params_unmasked[idx] = self._params[idx].data.mul(
1 - self._param_masks[idx] # inverted mask
)
if not self._mask_applied[idx]:
self._params_unmasked[idx] = self._params[idx].data.mul(
1 - self._param_masks[idx] # inverted mask
)
self._params[idx].data.mul_(self._param_masks[idx])
self._mask_applied[idx] = True

def reset(self):
"""
@@ -429,7 +450,8 @@ def pruning_end(self, leave_enabled: bool):
if not leave_enabled:
self.enabled = False
self._allow_reintroduction = False
self.apply() # ensure that weights are pruned to final level
if not self._mask_gradients_only:
self.apply() # ensure that weights are pruned to final level
if self._scorer:
self._scorer.on_pruning_end()

@@ -501,15 +523,20 @@ def _check_regen_param_vals(self, param_idx: int = None):

def _create_hooks(self):
for idx, (param, layer) in enumerate(zip(self._params, self._layers)):
if self._forward_hooks[idx] is None:
self._forward_hooks[idx] = layer.register_forward_pre_hook(
partial(self._hook_mask_forward, idx)
)
if not self._mask_gradients_only:
if self._forward_hooks[idx] is None:
self._forward_hooks[idx] = layer.register_forward_pre_hook(
partial(self._hook_mask_forward, idx)
)

if self._allow_reintroduction and self._undo_mask_hooks[idx] is None:
self._undo_mask_hooks[idx] = layer.register_forward_hook(
partial(self._hook_undo_mask, idx)
)
if (
self._allow_reintroduction
and self._undo_mask_hooks[idx] is None
and not self._mask_gradients_only
):
self._undo_mask_hooks[idx] = layer.register_full_backward_hook(
partial(self._hook_undo_mask, idx)
)

if self._gradient_hooks[idx] is None:
self._gradient_hooks[idx] = param.register_hook(
Expand All @@ -536,23 +563,24 @@ def _delete_hooks(self):
def _hook_mask_forward(
self, param_idx: int, mod: Module, inp: Union[Tensor, Tuple[Tensor]]
):
self.apply(param_idx)
with torch.no_grad():
self.apply(param_idx)

def _hook_undo_mask(self, param_idx, module, inp, out):
if self._allow_reintroduction:
with torch.no_grad():
self._params[param_idx].data.add_(self._params_unmasked[param_idx])
self._mask_applied[param_idx] = False

def _hook_mask_gradient(self, param_idx, grad):
if 0.0 <= self._track_grad_mom < 1.0:
self._params_grad[param_idx].mul_(self._track_grad_mom).add_(
(1.0 - self._track_grad_mom) * grad
)

return (
grad.mul_(self._param_masks[param_idx])
if not self._allow_reintroduction
else grad # do not mask gradient for movement pruning
if self._mask_gradients_only or not self._allow_reintroduction
else grad
)

def _setup_params_init(self):
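With mask_gradients_only=True the weights are never zeroed at any point; only the gradient hook filters updates on the pruned coordinates, which is what the _hook_mask_gradient change above enables. A minimal sketch of that behavior in plain torch (the tensors and the quadratic loss are illustrative, not the class's API):

import torch

weight = torch.nn.Parameter(torch.randn(4, 4))
mask = (torch.rand(4, 4) > 0.5).float()

# mask gradients only: the weight tensor stays dense at all times,
# but pruned coordinates receive no updates
weight.register_hook(lambda grad: grad * mask)

optimizer = torch.optim.SGD([weight], lr=0.1)
loss = (weight ** 2).sum()
loss.backward()
optimizer.step()  # only unmasked entries move; masked entries keep their dense values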
