Revert "BundledAutotuneCache (pytorch#134959)"
This reverts commit 7090211.

Reverted pytorch#134959 on behalf of https://github.com/albanD because the newly added test fails on ROCm CI ([comment](pytorch#134959 (comment)))
pytorchmergebot committed Oct 11, 2024
1 parent 74e8713 commit 1358969
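For context on the revert reason, here is a minimal sketch of how a CUDA-only test is commonly kept off ROCm CI, by gating on torch.version.hip (a version string only on ROCm builds, None otherwise). This illustrates the usual pattern only; it is not the follow-up fix that was actually applied:

import unittest

import torch

# torch.version.hip is a version string on ROCm builds and None otherwise.
IS_ROCM = torch.cuda.is_available() and torch.version.hip is not None


class ExampleTest(unittest.TestCase):
    @unittest.skipIf(not torch.cuda.is_available(), "Requires a CUDA/ROCm device")
    @unittest.skipIf(IS_ROCM, "Known failure on ROCm CI")
    def test_cuda_only_behavior(self):
        x = torch.randn(4, device="cuda")
        self.assertEqual(x.shape, (4,))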
Showing 7 changed files with 6 additions and 358 deletions.
17 changes: 1 addition & 16 deletions test/inductor/mock_cache.py
@@ -76,15 +76,13 @@ class _GlobalStats(threading.local):
     def __init__(self) -> None:
         self.autotune_local = _GlobalItemStats()
         self.autotune_remote = _GlobalItemStats()
-        self.bundled_autotune = _GlobalItemStats()
         self.fx_graph = _GlobalItemStats()
         self.triton = _GlobalItemStats()
         self.aot_autograd = _GlobalItemStats()

     def reset(self) -> None:
         self.autotune_local.reset()
         self.autotune_remote.reset()
-        self.bundled_autotune.reset()
         self.fx_graph.reset()
         self.triton.reset()
         self.aot_autograd.reset()
@@ -96,7 +94,6 @@ def report(self):
         subs = (
             ("autotune_local", self.autotune_local),
             ("autotune_remote", self.autotune_remote),
-            ("bundled_autotune", self.bundled_autotune),
             ("fx_graph", self.fx_graph),
             ("triton", self.triton),
             ("aot_autograd", self.aot_autograd),
@@ -154,7 +151,7 @@ def _put(self, key: str, data: Any) -> None:
     "fx_graph_remote_cache",
     "autotune_local_cache",
     "autotune_remote_cache",
-    "bundled_autotune_remote_cache",
+    # "bundled_autotune_cache",
 )


@@ -197,12 +194,6 @@ def __enter__(self) -> Self:
         )
         self._stack.enter_context(ctx)

-        ctx = patch(
-            "torch._inductor.remote_cache.RemoteBundledAutotuneCache.backend_override_cls",
-            MockBackend.with_name("bundled_autotune"),
-        )
-        self._stack.enter_context(ctx)
-
         ctx = patch(
             "torch._inductor.remote_cache.RemoteFxGraphCache.backend_override_cls",
             MockBackend.with_name("fx_graph"),
@@ -222,12 +213,6 @@ def __enter__(self) -> Self:
         )
         self._stack.enter_context(ctx)

-        ctx = patch(
-            "torch._inductor.fb.remote_cache.FbRemoteBundledAutotuneCache.backend_override_cls",
-            MockBackend.with_name("bundled_autotune"),
-        )
-        self._stack.enter_context(ctx)
-
         ctx = patch(
             "torch._inductor.fb.remote_cache.FbRemoteFxGraphCache.backend_override_cls",
             MockBackend.with_name("fx_graph"),
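The hunks above show the pattern PatchCaches relies on: each Remote*Cache class exposes a backend_override_cls hook that tests replace with a named MockBackend. Below is a minimal sketch of that patching idiom; InMemoryBackend and patch_autotune_cache_backend are illustrative names, not part of the PyTorch API:

import contextlib
from unittest.mock import patch


class InMemoryBackend:
    """Toy stand-in for a remote cache backend: stores puts/gets in a dict."""

    store = {}

    def get(self, key):
        return self.store.get(key)

    def put(self, key, data):
        self.store[key] = data


@contextlib.contextmanager
def patch_autotune_cache_backend():
    # Mirrors PatchCaches.__enter__ above: swap the backend class behind
    # RemoteAutotuneCache for the duration of a test.
    with contextlib.ExitStack() as stack:
        stack.enter_context(
            patch(
                "torch._inductor.remote_cache.RemoteAutotuneCache.backend_override_cls",
                InMemoryBackend,
            )
        )
        yield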
46 changes: 0 additions & 46 deletions test/inductor/test_codecache.py
@@ -912,7 +912,6 @@ def reset(self):
     @config.patch({"fx_graph_remote_cache": False})
     @config.patch({"autotune_local_cache": False})
     @config.patch({"autotune_remote_cache": True})
-    @config.patch({"bundled_autotune_remote_cache": False})
     @config.patch({"max_autotune": True})
     def test_autotune_cache(self):
         class Model(torch.nn.Module):
@@ -945,51 +944,6 @@ def f(x, y, a, b):
         for k in global_stats.triton.cache.keys():
             self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c10")

-    @unittest.skipIf(not HAS_CUDA, "Requires CUDA")
-    @unittest.skipIf(not SM80OrLater, "Requires SM80+")
-    @config.patch({"fx_graph_cache": False})
-    @config.patch({"fx_graph_remote_cache": False})
-    @config.patch({"autotune_local_cache": True})
-    @config.patch({"autotune_remote_cache": False})
-    @config.patch({"bundled_autotune_remote_cache": True})
-    @config.patch({"max_autotune": True})
-    def test_bundled_autotune_remote_cache(self):
-        class Model(torch.nn.Module):
-            def forward(self, a, b, c, d, e, f):
-                return a + b, c + d, e + f
-
-        def f(a, b, c, d, e, f):
-            return Model()(a, b, c, d, e, f)
-
-        f_compiled = torch.compile(f, fullgraph=True)
-
-        a = torch.randn(101, 100).cuda()
-        b = torch.randn(101, 100).cuda()
-        c = torch.randn(102, 100).cuda()
-        d = torch.randn(102, 100).cuda()
-        e = torch.randn(103, 100).cuda()
-        f = torch.randn(103, 100).cuda()
-
-        with PatchCaches():
-            f_compiled(a, b, c, d, e, f)
-
-            self.assertEqual(global_stats.autotune_local, Stats(3, 0, 3))
-            self.assertEqual(global_stats.bundled_autotune, Stats(1, 0, 1))
-
-            self.reset()
-            f_compiled(a, b, c, d, e, f)
-
-            self.assertEqual(global_stats.autotune_local, Stats(6, 3, 3))
-            self.assertEqual(global_stats.bundled_autotune, Stats(1, 1, 1))
-
-        # Check that the cache entries seem reasonable
-        for k in global_stats.autotune_local.cache.keys():
-            self.assertRegex(k, r"tmp[^/]*/([^/]{2})/c\1[^/]{49}\.best_config")
-        for k in global_stats.bundled_autotune.cache.keys():
-            self.assertRegex(k, r"pt2:bundled-autotune-v1::[0-9a-z]{64}:c10")
-        for k in global_stats.triton.cache.keys():
-            self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c10")
-

 class TestRemoteAOTAutogradCache(TestCase):
     @unittest.skipIf(not HAS_CUDA, "Requires CUDA")
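The removed test drives torch._inductor.config flags that appear throughout this diff (autotune_local_cache, max_autotune, and the now-reverted bundled_autotune_remote_cache). Here is a small sketch of using config.patch as a context manager for that purpose; the helper name and the particular flag values are illustrative only:

import torch
import torch._inductor.config as inductor_config


def run_with_autotune_cache(fn, *args):
    # config.patch accepts a dict and works as a context manager (the tests
    # above use it as a decorator). Compilation is triggered by the first
    # call, so the compiled function is invoked inside the patched scope.
    with inductor_config.patch({"autotune_local_cache": True, "max_autotune": True}):
        compiled = torch.compile(fn, fullgraph=True)
        return compiled(*args)


# Example:
# out = run_with_autotune_cache(lambda a, b: a + b, torch.randn(8), torch.randn(8))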
10 changes: 1 addition & 9 deletions torch/_inductor/codecache.py
@@ -69,8 +69,6 @@
 from torch._utils_internal import log_cache_bypass

 from .remote_cache import create_cache
-from .runtime import autotune_cache
-from .runtime.autotune_cache import AutotuneCacheBundler
 from .utils import _align


@@ -1118,9 +1116,6 @@ def iterate_over_candidates() -> Generator[CompiledFxGraph, None, None]:

         write_atomic(artifact_path, code, make_dirs=True)

-        inductor_meta = autotune_cache.inductor_meta_from_config()
-        AutotuneCacheBundler.begin_compile(inductor_meta, code=code)
-
         try:
             graph.current_callable = PyCodeCache.load_by_key_path(
                 graph.cache_key,
@@ -1592,10 +1587,7 @@ def __init__(

     def __call__(self, inputs: List[Any]) -> Any:
         assert self.current_callable is not None
-        try:
-            return self.current_callable(inputs)
-        finally:
-            AutotuneCacheBundler.end_compile()
+        return self.current_callable(inputs)


 def run_command_and_check(cmd_: str) -> None:
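The last hunk above removes the try/finally bracketing that closed out the autotune bundler after the compiled callable ran. The following is a generic sketch of that bracketing pattern, with Bundler standing in for AutotuneCacheBundler; only the begin_compile/end_compile calls visible in this diff are assumed:

from typing import Any, Callable, List


class Bundler:
    """Illustrative stand-in for AutotuneCacheBundler."""

    @classmethod
    def begin_compile(cls, inductor_meta: dict, code: str) -> None:
        # Start collecting autotune results for this compiled graph.
        pass

    @classmethod
    def end_compile(cls) -> None:
        # Flush whatever was collected, even if the call below raised.
        pass


def call_compiled(current_callable: Callable[[List[Any]], Any], inputs: List[Any]) -> Any:
    try:
        return current_callable(inputs)
    finally:
        Bundler.end_compile()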
10 changes: 0 additions & 10 deletions torch/_inductor/config.py
@@ -24,10 +24,6 @@ def autotune_remote_cache_default() -> Optional[bool]:
     return _get_tristate_env("TORCHINDUCTOR_AUTOTUNE_REMOTE_CACHE")


-def bundled_autotune_remote_cache_default() -> Optional[bool]:
-    return _get_tristate_env("TORCHINDUCTOR_BUNDLED_AUTOTUNE_REMOTE_CACHE")
-
-
 # Enable auto_functionalized_v2 (enabled by default)
 enable_auto_functionalized_v2 = (
     os.environ.get("TORCHDYNAMO_AUTO_FUNCTIONALIZED_V2", "0") == "1"
@@ -62,12 +58,6 @@ def bundled_autotune_remote_cache_default() -> Optional[bool]:
 # None: Not set -- Off for OSS, JustKnobs based for internal
 autotune_remote_cache: Optional[bool] = autotune_remote_cache_default()

-# enable bundled autotune cache
-# False: Disables the cache
-# True: Enables the cache
-# None: Not set -- Off for OSS, JustKnobs based for internal
-bundled_autotune_remote_cache: Optional[bool] = bundled_autotune_remote_cache_default()
-
 # Force disabled all inductor level caching -- This will override any other caching flag
 force_disable_caches = os.environ.get("TORCHINDUCTOR_FORCE_DISABLE_CACHES") == "1"

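The reverted default helper reads a tri-state environment variable (TORCHINDUCTOR_BUNDLED_AUTOTUNE_REMOTE_CACHE): unset means "defer to internal defaults", otherwise the flag is forced on or off. A plausible sketch of such a helper follows, assuming "1" enables and any other value disables; the real _get_tristate_env may differ in detail:

import os
from typing import Optional


def get_tristate_env(name: str) -> Optional[bool]:
    value = os.environ.get(name)
    if value is None:
        # Not set: leave the config at None so internal defaults decide.
        return None
    return value == "1"


# Example: the reverted config flag would then default to
# get_tristate_env("TORCHINDUCTOR_BUNDLED_AUTOTUNE_REMOTE_CACHE")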
6 changes: 0 additions & 6 deletions torch/_inductor/graph.py
@@ -91,8 +91,6 @@
     needs_realized_inputs,
     unsupported_output_tensor,
 )
-from .runtime import autotune_cache
-from .runtime.autotune_cache import AutotuneCacheBundler
 from .scheduler import BaseSchedulerNode
 from .sizevars import SizeVarAllocator
 from .utils import (
@@ -1911,10 +1909,6 @@ def _compile_to_module(self) -> ModuleType:

         GraphLowering.save_output_code(code)
         output_code_log.debug("Output code: \n%s", code)
-
-        inductor_meta = autotune_cache.inductor_meta_from_config()
-        AutotuneCacheBundler.begin_compile(inductor_meta, code=code)
-
         try:
             linemap = [(line_no, node.stack_trace) for line_no, node in linemap]  # type: ignore[misc]
             key, path = PyCodeCache.write(code)
4 changes: 0 additions & 4 deletions torch/_inductor/remote_cache.py
@@ -283,10 +283,6 @@ class RemoteAutotuneCache(RedisRemoteCache):
     pass


-class RemoteBundledAutotuneCache(RedisRemoteCache):
-    pass
-
-
 class RemoteFxGraphCache(RedisRemoteCache):
     pass

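remote_cache.py declares one empty RedisRemoteCache subclass per cache, so each backend can be overridden independently in tests (see the backend_override_cls patches in mock_cache.py above). As a rough, generic sketch of the get/put shape such a Redis-backed cache wraps (not PyTorch's actual implementation), assuming the redis-py client:

import json
from typing import Any, Optional

import redis  # assumes the redis-py package is available


class SimpleRedisCache:
    def __init__(self, host: str = "localhost", port: int = 6379) -> None:
        self._client = redis.Redis(host=host, port=port)

    def get(self, key: str) -> Optional[Any]:
        raw = self._client.get(key)
        return None if raw is None else json.loads(raw)

    def put(self, key: str, value: Any) -> None:
        self._client.set(key, json.dumps(value))


class SimpleAutotuneCache(SimpleRedisCache):
    # One subclass per cache, mirroring the per-cache classes above.
    pass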
(Diff for the remaining changed file is not shown in this view.)
