Commit 6f4f7cf (1 parent: fe9d81d): 2 changed files with 257 additions and 0 deletions.
New file (module einops.experimental.einsplit, 213 lines added):
""" | ||
torch-only version for efficient production of multiple outputs from the same input, | ||
while making all rearranges. | ||
python 3.9+ because of typing. | ||
implementation is a bit fragile, pin exact version if using. | ||
Name isn't great, other names under consideration: | ||
- multilinear (... confusion with multilinearity), | ||
- mergedlinear | ||
- multiprojection | ||
""" | ||
from typing import Iterable

import torch
import torch.nn.functional as F
from torch.nn import ModuleList, Parameter

from einops.einops import _product
from einops.layers.torch import Rearrange
from einops.parsing import ParsedExpression

def _split_into_groups(pattern: str) -> list[list[str]]:
    # does not differentiate composed and non-composed ellipsis
    result: list[list[str]] = []
    pattern_rest: str = pattern
    # check there is a space before `(` and after `)` for proper style
    pattern_with_edges = f' {pattern} '
    msg = f'please add spaces before and after parentheses in {pattern=}'
    assert pattern_with_edges.count(' (') == pattern_with_edges.count('('), msg
    assert pattern_with_edges.count(') ') == pattern_with_edges.count(')'), msg

    while True:
        if pattern_rest.startswith('('):
            i = pattern_rest.index(')')
            group, pattern_rest = pattern_rest[1:i], pattern_rest[i + 1:]
            assert '(' not in group, 'unbalanced brackets'
            result.append(group.split())
        elif '(' in pattern_rest:
            i = pattern_rest.index('(')
            ungrouped, pattern_rest = pattern_rest[:i], pattern_rest[i:]
            assert ')' not in ungrouped, 'unbalanced brackets'
            result.extend([[x] for x in ungrouped.split()])
        else:
            # no more brackets, just parse the end
            result.extend([[x] for x in pattern_rest.split()])
            break
    return result
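
# for illustration: _split_into_groups('a (b c) d') -> [['a'], ['b', 'c'], ['d']]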


def _join_groups_to_pattern(groups: list[list[str]]) -> str:
    result = ''
    for group in groups:
        if len(group) == 1:
            result += f'{group[0]} '
        else:
            result += '(' + ' '.join(group) + ') '
    return result.strip()
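
# for illustration: _join_groups_to_pattern([['a'], ['b', 'c']]) -> 'a (b c)'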


def _assert_good_identifier(axis_label: str) -> None:
    valid, reason = ParsedExpression.check_axis_name_return_reason(
        axis_label, allow_underscore=False
    )
    assert valid, f'Bad {axis_label=}, {reason}'


def _get_name_for_anon_axis(disallowed_axes: Iterable[str], axis_len: int) -> str:
    prefix = 'c'
    while True:
        prefix += '_'
        axis_name = f'{prefix}{axis_len}'
        if axis_name not in disallowed_axes:
            return axis_name
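
# for illustration: _get_name_for_anon_axis(['b', 's'], 9) -> 'c_9' (or 'c__9' if 'c_9' is taken)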


def _process_input_pattern(input_pattern) -> tuple[Rearrange, list[str], int]:
    """
    examples of input patterns: 'a 1 (2 3 b) ()', 'c d e f 9', 'b s c1=5 c2=7'
    axes tagged with a size (c1=5) and plain integers are projected input axes,
    untagged named axes are treated as batch axes; ellipsis is not supported
    """
    groups = _split_into_groups(input_pattern)

    all_identifiers = [el.partition('=')[0] for group in groups for el in group if not str.isnumeric(el)]
    assert len(all_identifiers) == len(set(all_identifiers)), f"duplicate names in {input_pattern=}"

    batch_axes = []
    input_axes2size = {}
    named_groups = []
    for group in groups:
        named_group = []
        for axis in group:
            if '=' in axis:
                axis_name, _, axis_len_str = axis.partition('=')
                axis_len = int(axis_len_str)
                assert axis_len > 0, axis
                _assert_good_identifier(axis_name)
                input_axes2size[axis_name] = axis_len
                named_group.append(axis_name)
            elif str.isnumeric(axis):
                axis_len = int(axis)
                assert axis_len > 0, f'{axis_len=}'
                axis_name = _get_name_for_anon_axis(all_identifiers, axis_len)
                input_axes2size[axis_name] = axis_len
                named_group.append(axis_name)
            else:
                _assert_good_identifier(axis)
                batch_axes.append(axis)
                named_group.append(axis)

        named_groups.append(named_group)

    init_reordering_pattern = _join_groups_to_pattern(named_groups)
    init_reordering_pattern += ' -> ' + _join_groups_to_pattern([[x] for x in batch_axes] + [list(input_axes2size)])
    total_input_size = _product(list(input_axes2size.values()))

    return Rearrange(init_reordering_pattern, **input_axes2size), batch_axes, total_input_size
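
# for illustration: _process_input_pattern('b s c1=5 c2=7') builds the rearrange
# 'b s c1 c2 -> b s (c1 c2)' with batch_axes == ['b', 's'] and total_input_size == 35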


def _process_output_pattern(output_pattern, batch_axes) -> tuple[Rearrange, int]:
    groups = _split_into_groups(output_pattern)

    all_identifiers = [el.partition('=')[0] for group in groups for el in group if not str.isnumeric(el)]
    assert len(all_identifiers) == len(set(all_identifiers)), f"duplicate names in {output_pattern=}"

    output_axis2size = {}
    named_groups = []
    for group in groups:
        named_group = []
        for axis in group:
            assert '=' not in axis, f'wrong identifier {axis=}, sizes cannot be tagged in outputs'
            if str.isnumeric(axis):
                axis_len = int(axis)
                assert axis_len > 0, f'{axis_len=}'
                axis_name = _get_name_for_anon_axis(all_identifiers, axis_len)
                output_axis2size[axis_name] = axis_len
                named_group.append(axis_name)
            else:
                assert axis in batch_axes, f'unknown axis in output, allowed only {batch_axes=}'
                named_group.append(axis)

        named_groups.append(named_group)

    reordering_pattern = _join_groups_to_pattern([[x] for x in batch_axes] + [[*output_axis2size]])
    reordering_pattern += ' -> ' + _join_groups_to_pattern(named_groups)
    total_output_size = _product(list(output_axis2size.values()))
    return Rearrange(reordering_pattern, **output_axis2size), total_output_size
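
# for illustration: _process_output_pattern('b s 9', batch_axes=['b', 's']) builds
# 'b s c_9 -> b s c_9' (c_9 is a generated anonymous axis) with total_output_size == 9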


class EinSplit(torch.nn.Module):
    def __init__(self, input_pattern: str):
        """all projected axis sizes must be provided in the pattern"""
        super().__init__()
        self.input_pattern = input_pattern
        self.outputs: list[tuple] = []
        # intermediate parsing results

        # parsed = ParsedExpression(input_pattern)
        # if parsed.has_ellipsis:
        #     raise RuntimeError("no support for ellipsis so far")
        # self._required_identifiers = parsed.identifiers

        self._in_rearrange, self.batch_axes, self._total_input_size = \
            _process_input_pattern(input_pattern)
        self._out_rearranges = ModuleList([])
        self._out_sizes = []

        # weight/bias start empty and grow with every add_output call
        self.weight = Parameter(torch.empty([0, self._total_input_size]))
        self.bias = Parameter(torch.empty([0]))
        self.bias_mask = Parameter(torch.empty([0], dtype=torch.bool), requires_grad=False)

    def add_output(self, pattern: str, init: str = 'xavier_normal', bias: bool = True) -> int:
        """returns index in output list"""
        idx = len(self.outputs)
        out_rearrange, out_total_size = \
            _process_output_pattern(pattern, batch_axes=self.batch_axes)
        self.outputs.append((pattern, init, bias))
        self._out_sizes.append(out_total_size)
        self._out_rearranges.append(out_rearrange)

        W = self.weight.new_zeros(out_total_size, self._total_input_size)
        b = self.bias.new_zeros(out_total_size)
        b_mask = self.bias_mask.new_full(size=(out_total_size,), fill_value=int(bias), dtype=torch.bool)

        if init == 'xavier_normal':
            torch.nn.init.xavier_normal_(W)  # bias is zero
        elif init == 'zeros':
            torch.nn.init.zeros_(W)  # bias is zero
        else:
            raise ValueError(f'Unknown {init=}')

        with torch.no_grad():
            self.weight = Parameter(torch.concatenate([self.weight, W]))
            self.bias = Parameter(torch.concatenate([self.bias, b]))
            self.bias_mask = Parameter(torch.concatenate([self.bias_mask, b_mask]), requires_grad=False)

        return idx

    def __repr__(self):
        output = f"EinSplit({self.input_pattern})"
        for i, (pattern, init, bias) in enumerate(self.outputs):
            output += f'\n + output {i}: {pattern}; {bias=}, {init=}'
        return output

    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        merged = F.linear(self._in_rearrange(x), self.weight, self.bias * self.bias_mask)
        split = torch.split(merged, self._out_sizes, dim=-1)
        return [
            rearr_out(x) for rearr_out, x in zip(self._out_rearranges, split)
        ]
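
A minimal usage sketch, mirroring the test below: EinSplit fuses several projections of the same input into one weight matrix, so forward runs a single F.linear and then splits and rearranges the result. The names q_idx, k_idx, and x here are illustrative, not from the commit.

import torch
from einops.experimental.einsplit import EinSplit

mod = EinSplit('b s c1=5 c2=7')          # batch axes b, s; projected input size 5 * 7 = 35
q_idx = mod.add_output('b s 9')          # output 0, shape (b, s, 9)
k_idx = mod.add_output('b s 11')         # output 1, shape (b, s, 11)

x = torch.randn(2, 3, 5, 7)              # (b, s, c1, c2)
q, k = mod(x)                            # one fused F.linear, then split + rearrange
assert q.shape == (2, 3, 9) and k.shape == (2, 3, 11)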

New test file (44 lines added):
import pytest

from . import is_backend_tested

def test_torch_einsplit():
    if not is_backend_tested("torch"):
        pytest.skip()

    import torch
    from einops.experimental.einsplit import EinSplit

    b = 2
    s = 3
    c1 = 5
    c2 = 7
    c_out1 = 9
    c_out2 = 11
    mod = EinSplit(f'b s {c1=} {c2=}')
    out1_idx = mod.add_output(f'b s {c_out1}', init='xavier_normal')
    out2_idx = mod.add_output(f'b s {c_out2}', init='xavier_normal')
    out3_idx = mod.add_output(f'(3 b 7) s {c_out2}', init='zeros')
    assert (out1_idx, out2_idx, out3_idx) == (0, 1, 2)

    optim = torch.optim.Adam(mod.parameters(), lr=1e-2)
    batch = torch.randn(b, s, c1, c2)
    out1_norms = []
    out2_norms = []
    out3_norms = []
    for iteration in range(100):
        out1, out2, out3 = mod(batch)
        loss = out1.norm() + out2.norm()
        loss.backward()
        optim.step()
        optim.zero_grad()
        out1_norms.append(out1.norm().item())
        out2_norms.append(out2.norm().item())
        out3_norms.append(out3.norm().item())

        if iteration % 10 == 0:
            print(f'{iteration:>5} {loss:6.2f}')

    # out3 is zero-initialized and never enters the loss, so it stays zero;
    # minimizing the loss shrinks the norms of out1 and out2
    assert out3_norms[0] == out3_norms[-1] == 0
    assert out1_norms[0] > 2 * out1_norms[-1]
    assert out2_norms[0] > 2 * out2_norms[-1]