From 8dc6267bc0fdcc4786b374295d92b0634d983294 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Mar 2026 19:07:41 +0000 Subject: [PATCH] Fix race condition: make reduction bound temporaries use PRIVATE address space Co-authored-by: inducer <352067+inducer@users.noreply.github.com> Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- doc/conf.py | 1 + pytato/target/loopy/codegen.py | 18 ++++++++++++------ test/test_codegen.py | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index cbf7dc94c..6d0f7a95e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -72,6 +72,7 @@ "ArithmeticExpression": "obj:pymbolic.ArithmeticExpression", # pytools "lp.TemporaryVariable": "class:loopy.TemporaryVariable", + "lp.AddressSpace": "class:loopy.AddressSpace", } diff --git a/pytato/target/loopy/codegen.py b/pytato/target/loopy/codegen.py index abf92e96a..22edf06e3 100644 --- a/pytato/target/loopy/codegen.py +++ b/pytato/target/loopy/codegen.py @@ -586,7 +586,8 @@ def map_index_lambda(self, expr: IndexLambda, bound_name, (), np.dtype(np.int64), bound_result, state, self, output_to_temporary=True, store_inames=(), - result_inames=inames, add_domain=False)])) + result_inames=inames, add_domain=False, + address_space=lp.AddressSpace.PRIVATE)])) redn_bound_temps[bound_name] = bound_result new_bound = prim.Variable(bound_name) else: @@ -1098,7 +1099,8 @@ def add_store( state: CodeGenState, cgen_mapper: CodeGenMapper, *, tags: frozenset[Tag] | None = None, axes: AxesT | None = None, output_to_temporary: bool = False, result_inames: tuple[str, ...] | None = None, - store_inames: tuple[str, ...] | None = None, add_domain: bool = True) -> str: + store_inames: tuple[str, ...] | None = None, add_domain: bool = True, + address_space: lp.AddressSpace = lp.AddressSpace.GLOBAL) -> str: """Add an instruction that stores to a variable in the kernel. :param name: name of the output array, which is created @@ -1114,6 +1116,9 @@ def add_store( must be a subset of *result_inames* :param result_inames: the index inames of the right hand side of the assignment :param add_domain: add a new domain to the kernel for these inames/shape. + :param address_space: the address space for the temporary variable, when + *output_to_temporary* is ``True``. Defaults to + :attr:`loopy.AddressSpace.GLOBAL`. :returns: the id of the generated instruction """ @@ -1171,7 +1176,8 @@ def add_store( kernel = state.kernel if output_to_temporary: - tvar = get_loopy_temporary(name, shape, dtype, cgen_mapper, state, tags=tags) + tvar = get_loopy_temporary(name, shape, dtype, cgen_mapper, state, tags=tags, + address_space=address_space) temporary_variables = dict(kernel.temporary_variables) temporary_variables[name] = tvar kernel = kernel.copy(temporary_variables=temporary_variables, @@ -1235,11 +1241,11 @@ def add_substitution(subst_name: str, ndim: int, result: ImplementedResult, def get_loopy_temporary( name: str, shape: ShapeType, dtype: np.dtype[Any], cgen_mapper: CodeGenMapper, state: CodeGenState, *, - tags: frozenset[Tag] | None = None) -> lp.TemporaryVariable: + tags: frozenset[Tag] | None = None, + address_space: lp.AddressSpace = lp.AddressSpace.GLOBAL + ) -> lp.TemporaryVariable: if tags is None: tags = frozenset() - # always allocating to global address space to avoid stack overflow - address_space = lp.AddressSpace.GLOBAL return lp.TemporaryVariable(name, shape=shape_to_scalar_expression(shape, cgen_mapper, state), dtype=dtype, diff --git a/test/test_codegen.py b/test/test_codegen.py index 073b80eca..ffba83bfd 100755 --- a/test/test_codegen.py +++ b/test/test_codegen.py @@ -601,6 +601,40 @@ def test_only_deps_as_knl_args(): assert "y" not in knl.arg_dict +def test_reduction_bound_temps_are_private(): + # Regression test for https://github.com/inducer/pytato/issues/648 + # (Temporaries for bounds should not be global) + # Reduction bound temporaries (for non-affine bounds) should use + # PRIVATE address space to avoid race conditions in parallel kernels. + n = 5 + row_starts = pt.make_placeholder("row_starts", (n+1,), np.int64) + col_indices = pt.make_placeholder("col_indices", (n*3,), np.int64) + values = pt.make_placeholder("values", (n*3,), np.float64) + x = pt.make_placeholder("x", (n,), np.float64) + + csr = pt.make_csr_matrix( + shape=(n, n), + elem_values=values, + elem_col_indices=col_indices, + row_starts=row_starts + ) + + result = csr @ x + knl = pt.generate_loopy(result).kernel + + found_bound_temp = False + for name, tv in knl.temporary_variables.items(): + if name.endswith(("_lbound", "_ubound")): + found_bound_temp = True + assert tv.address_space == lp.AddressSpace.PRIVATE, ( + f"Reduction bound temporary '{name}' should be PRIVATE, " + f"got {tv.address_space}") + + assert found_bound_temp, ( + "Expected at least one reduction bound temporary whose name ends in " + "'_lbound' or '_ubound', but none were found.") + + @pytest.mark.parametrize("dtype", (np.float32, np.float64, np.complex128)) @pytest.mark.parametrize("function_name", ("abs", "sin", "cos", "tan", "arcsin", "arccos", "arctan", "sinh", "cosh", "tanh", "exp", "log", "log10", "sqrt",