
Commit 4a63cab

bdhirsh authored and pytorchmergebot committed
[cudagraphs] Fix issue in collecting static_input_idxs (#152287)

Related to #152275.

Pull Request resolved: #152287
Approved by: https://github.com/bdhirsh, https://github.com/eellison
Co-authored-by: Brian Hirsh <hirsheybar@fb.com>
1 parent bce7f0a commit 4a63cab
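For context, a minimal sketch (not part of the commit, assuming a CUDA device is available) of the behavior this fix pins down: a tensor marked with torch._dynamo.mark_static_address keeps a stable data pointer, so cudagraphs can record it as a static input and only copy the remaining inputs on each call. Both APIs used here are exercised by the tests added below.

    import torch

    mod = torch.nn.Linear(2, 2).cuda()

    def fn(x, y):
        return torch.cos(x) + mod(y)

    compiled = torch.compile(fn, mode="reduce-overhead")

    # y promises to keep the same storage across calls
    y = torch.randn(2, 2, device="cuda")
    torch._dynamo.mark_static_address(y)

    for _ in range(5):
        # A fresh x each iteration changes its data pointer; with correctly
        # collected static_input_idxs this must not re-record the cudagraph.
        x = torch.randn(2, 2, device="cuda")
        compiled(x, y)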

File tree: 6 files changed (91 additions, 13 deletions)

test/dynamo/test_subclasses.py (2 additions, 2 deletions)

@@ -2135,9 +2135,9 @@ def inner_compile(
             extern_node_serializer: Optional[Callable[[list[Any]], Any]] = None,
         ):
             if dynamic:
-                self.assertEqual(static_input_idxs, [0, 1, 2, 3, 4])
+                self.assertEqual(static_input_idxs, [2, 3, 4])
             else:
-                self.assertEqual(static_input_idxs, [0, 1, 2])
+                self.assertEqual(static_input_idxs, [1, 2])
             return gm

         compiler = functools.partial(compile_fx, inner_compile=inner_compile)

test/inductor/test_cudagraph_trees.py (34 additions, 0 deletions)

@@ -2426,6 +2426,40 @@ def fn(x, y):
         self.run_static_input_param_test(fn, 4)
         self.assertEqual(counters["inductor"]["cudagraph_skips"], 0)

+    @torch._dynamo.config.patch("error_on_recompile", True)
+    @torch._dynamo.config.patch("inline_inbuilt_nn_modules", True)
+    def test_no_rerecord_with_mark_static_address(self):
+        class Mod(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = nn.Linear(2, 2)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        mod = Mod().cuda()
+
+        def fn_eager(x, marked_static_y):
+            return torch.cos(x) + mod(marked_static_y)
+
+        with torch.device("cuda"):
+            fn_compiled = torch.compile(fn_eager, mode="reduce-overhead")
+
+        # y is marked static
+        y = torch.randn(2, 2)
+        torch._dynamo.mark_static_address(y)
+
+        # Changing the data pointer of x should not lead to re-records
+        for _ in range(5):
+            x = torch.randn(2, 2, requires_grad=True)
+            res = fn_compiled(x, y)
+            res.sum().backward()
+            x.grad = None
+            mod.linear.weight.grad = None
+            mod.linear.bias.grad = None
+        # One forward and one backward
+        self.assertEqual(self.get_manager().new_graph_id().id, 2)
+
     def test_tensor_constant_mutation(self):
         class Foo(torch.nn.Module):
             def __init__(self) -> None:

test/inductor/test_inductor_freezing.py (30 additions, 0 deletions)

@@ -1,5 +1,6 @@
 # Owner(s): ["module: inductor"]
 import contextlib
+import copy
 import functools
 import importlib
 import itertools
@@ -375,6 +376,35 @@ def foo(mod, x):
         ):
             mod(x)

+    def test_static_indices_cudagraph(self):
+        if self.device != "cuda":
+            return
+
+        mod1 = torch.nn.Sequential(
+            torch.nn.Linear(2, 2).to(self.device), torch.nn.Linear(2, 2).to(self.device)
+        )
+        mod2 = copy.deepcopy(mod1)
+
+        def fn(x, y, mod):
+            x.add_(1)
+            getattr(mod, "0").bias.add_(2)
+            getattr(mod, "1").weight.add_(3)
+            return mod(x) + y
+
+        x1 = torch.randn(2, 2, device=self.device)
+        y1 = torch.randn(2, 2, device=self.device)
+        x2 = x1.clone()
+        y2 = y1.clone()
+
+        opt_fn = torch.compile(fn, mode="reduce-overhead")
+
+        with torch.no_grad():
+            ref = fn(x1, y1, mod1)
+            res = opt_fn(x2, y2, mod2)
+        self.assertEqual(ref, res)
+        self.assertEqual(x1, x2)
+        self.assertEqual(y1, y2)
+
     def test_rng_op(self):
         @torch.compile()
         def foo():

torch/_functorch/aot_autograd.py (13 additions, 5 deletions)

@@ -1028,18 +1028,20 @@ def _try_get_metadata_from_dynamo(
     seen_sources = set()

     aot_autograd_arg_pos_to_source = []
+    static_input_indices = []
     # Collect the new inputs lifted by aotdispatch
-    for name in param_keys:
+    for i, name in enumerate(param_keys):
         assert name in param_name_to_source, f"{name} not found."
         source = param_name_to_source[name]
         assert source not in seen_sources, source
         seen_sources.add(source)
         aot_autograd_arg_pos_to_source.append(source)

+        static_input_indices.append(i)
+
     # Collect the dynamo graph inputs
     # TODO(mlazos): Revisit if this is still needed. With Dynamo install ID
     # matched tensors back into the Fx graph, this might not be necessary.
-    static_input_indices = []
     for pos, node in enumerate(mod.graph.find_nodes(op="placeholder")):
         assert hasattr(node, "_dynamo_source")
         source = node._dynamo_source
@@ -1048,16 +1050,22 @@ def _try_get_metadata_from_dynamo(
         aot_autograd_arg_pos_to_source.append(source)
         source_name = source.name() if source else str(source)

+        # input[i] in dynamo is now:
+        # input[i + len(extra_params)] in AOT,
+        # where extra_params are the params/buffers that dynamo baked into the
+        # OutputGraph
+        actual_pos = pos + len(param_keys)
+
         if "tensor_dict" in node.meta and node.meta["tensor_dict"].get(
             "_dynamo_static_input_type", None
         ):
             static_inputs_log.debug(
-                "Adding static input pos %s for source %s", pos, source_name
+                "Adding static input pos %s for source %s", actual_pos, source_name
             )
-            static_input_indices.append(pos)
+            static_input_indices.append(actual_pos)
         else:
             static_inputs_log.debug(
-                "Non-static input pos %s for source %s", pos, source_name
+                "Non-static input pos %s for source %s", actual_pos, source_name
             )

     assert full_args_num == len(aot_autograd_arg_pos_to_source)
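The remapping above is easiest to see with concrete numbers. A hypothetical illustration (values invented for this sketch, not taken from the commit):

    # AOT's calling convention is [*lifted_params, *dynamo_graph_inputs].
    param_keys = ["l.weight", "l.bias"]  # lifted params: AOT positions 0, 1
    dynamo_static_positions = [1]        # placeholder marked static in dynamo

    # Every lifted param is a static input:
    static_input_indices = list(range(len(param_keys)))  # [0, 1]

    # A dynamo placeholder at pos p sits at p + len(param_keys) in AOT:
    static_input_indices += [p + len(param_keys) for p in dynamo_static_positions]

    print(static_input_indices)  # [0, 1, 3] -- dynamo pos 1 became AOT pos 3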

torch/_inductor/compile_fx.py (3 additions, 4 deletions)

@@ -212,7 +212,7 @@ def get_static_input_idxs(num_fixed: int) -> list[int]:
     if not context or not context.fw_metadata:
         return fixed

-    return fixed + context.fw_metadata.static_input_indices
+    return context.fw_metadata.static_input_indices


 def record_original_output_strides(gm: GraphModule) -> None:
@@ -1745,7 +1745,6 @@ def fw_compiler_freezing(
     )

     aot_example_inputs = [aot_example_inputs[ind] for ind in preserved_arg_indices]
-    num_fixed = len(preserved_arg_indices) - num_example_inputs

     fake_mode = detect_fake_mode(aot_example_inputs)

@@ -1756,7 +1755,7 @@ def fw_compiler_freezing(
         idx for idx, n in enumerate(model_outputs) if isinstance(n, torch.fx.Node)
     ]

-    static_input_idxs = list(range(num_fixed))
+    static_input_idxs = []
     # constant params will be real tensors, not fake
     tracing_context = torch._guards.TracingContext.try_get()
     unwrapped_args_offsets = [0]
@@ -1788,7 +1787,7 @@ def fw_compiler_freezing(
                 tracing_context.params_flat[i] = None

     if tracing_context.fw_metadata:
-        static_input_idxs += tracing_context.fw_metadata.static_input_indices
+        static_input_idxs = tracing_context.fw_metadata.static_input_indices

     with mock.patch.object(fake_mode, "allow_non_fake_inputs", True):
         optimized_function = inner_compile(
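Since fw_metadata.static_input_indices now already contains the lifted-parameter positions (see the aot_autograd.py change above), keeping the old concatenation would double-count them. A small before/after with hypothetical values:

    fixed = [0, 1]                  # lifted params, always static
    fw_static = [0, 1, 3]           # now includes those positions itself

    old_result = fixed + fw_static  # [0, 1, 0, 1, 3] -- duplicated entries
    new_result = fw_static          # [0, 1, 3]       -- single source of truth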

torch/_inductor/freezing.py (9 additions, 2 deletions)

@@ -52,14 +52,21 @@ def replace_params_with_constants(
         in (MutationType.MUTATED_IN_GRAPH, MutationType.MUTATED_OUT_GRAPH)
     ]

+    static_indices_new = []
+    static_indices_offset = 0
     for i, (real_input, node) in enumerate(zip(flat_params, fake_inp_nodes)):
         if i in mutated_inps or i in aliased_input_args:
             preserved_arg_indices.append(i)
-            continue
-        replace_node_with_constant(gm, node, real_input)
+            if i in fw_metadata.static_input_indices:
+                new_static_index = i - static_indices_offset
+                static_indices_new.append(new_static_index)
+        else:
+            replace_node_with_constant(gm, node, real_input)
+            static_indices_offset += 1
     # add on non param inputs
     preserved_arg_indices.extend(range(len(flat_params), len(params)))
     # is this necessary ?
+    fw_metadata.static_input_indices = static_indices_new
     gm.recompile()
     return preserved_arg_indices
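The renumbering in replace_params_with_constants can be checked with a toy example. A hypothetical sketch of the same bookkeeping: when freezing folds an input into the graph as a constant, every surviving input after it shifts left by one, so a surviving static index shrinks by the number of folded inputs before it.

    static_input_indices = [0, 2, 3]
    preserved = {0, 3}   # inputs 1 and 2 get folded into graph constants

    static_indices_new = []
    offset = 0           # how many inputs have been folded away so far
    for i in range(4):
        if i in preserved:
            if i in static_input_indices:
                static_indices_new.append(i - offset)
        else:
            offset += 1  # folded input: later indices shift left

    print(static_indices_new)  # [0, 1] -- old 0 stays, old 3 becomes 1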