Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
"ArithmeticExpression": "obj:pymbolic.ArithmeticExpression",
# pytools
"lp.TemporaryVariable": "class:loopy.TemporaryVariable",
"lp.AddressSpace": "class:loopy.AddressSpace",
}


Expand Down
18 changes: 12 additions & 6 deletions pytato/target/loopy/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,8 @@ def map_index_lambda(self, expr: IndexLambda,
bound_name, (), np.dtype(np.int64), bound_result,
state, self, output_to_temporary=True,
store_inames=(),
result_inames=inames, add_domain=False)]))
result_inames=inames, add_domain=False,
address_space=lp.AddressSpace.PRIVATE)]))
redn_bound_temps[bound_name] = bound_result
new_bound = prim.Variable(bound_name)
else:
Expand Down Expand Up @@ -1098,7 +1099,8 @@ def add_store(
state: CodeGenState, cgen_mapper: CodeGenMapper, *,
tags: frozenset[Tag] | None = None, axes: AxesT | None = None,
output_to_temporary: bool = False, result_inames: tuple[str, ...] | None = None,
store_inames: tuple[str, ...] | None = None, add_domain: bool = True) -> str:
store_inames: tuple[str, ...] | None = None, add_domain: bool = True,
address_space: lp.AddressSpace = lp.AddressSpace.GLOBAL) -> str:
"""Add an instruction that stores to a variable in the kernel.

:param name: name of the output array, which is created
Expand All @@ -1114,6 +1116,9 @@ def add_store(
must be a subset of *result_inames*
:param result_inames: the index inames of the right hand side of the assignment
:param add_domain: add a new domain to the kernel for these inames/shape.
:param address_space: the address space for the temporary variable, when
*output_to_temporary* is ``True``. Defaults to
:attr:`loopy.AddressSpace.GLOBAL`.

:returns: the id of the generated instruction
"""
Expand Down Expand Up @@ -1171,7 +1176,8 @@ def add_store(
kernel = state.kernel

if output_to_temporary:
tvar = get_loopy_temporary(name, shape, dtype, cgen_mapper, state, tags=tags)
tvar = get_loopy_temporary(name, shape, dtype, cgen_mapper, state, tags=tags,
address_space=address_space)
temporary_variables = dict(kernel.temporary_variables)
temporary_variables[name] = tvar
kernel = kernel.copy(temporary_variables=temporary_variables,
Expand Down Expand Up @@ -1235,11 +1241,11 @@ def add_substitution(subst_name: str, ndim: int, result: ImplementedResult,
def get_loopy_temporary(
name: str, shape: ShapeType, dtype: np.dtype[Any],
cgen_mapper: CodeGenMapper, state: CodeGenState, *,
tags: frozenset[Tag] | None = None) -> lp.TemporaryVariable:
tags: frozenset[Tag] | None = None,
address_space: lp.AddressSpace = lp.AddressSpace.GLOBAL
) -> lp.TemporaryVariable:
if tags is None:
tags = frozenset()
# always allocating to global address space to avoid stack overflow
address_space = lp.AddressSpace.GLOBAL
return lp.TemporaryVariable(name,
shape=shape_to_scalar_expression(shape, cgen_mapper, state),
dtype=dtype,
Expand Down
34 changes: 34 additions & 0 deletions test/test_codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,40 @@ def test_only_deps_as_knl_args():
assert "y" not in knl.arg_dict


def test_reduction_bound_temps_are_private():
    """Check that reduction-bound temporaries are placed in PRIVATE memory.

    Regression test for https://github.com/inducer/pytato/issues/648
    (Temporaries for bounds should not be global).

    A CSR matrix-vector product is used as the test expression; the test
    expects the generated kernel to contain temporaries whose names end in
    ``_lbound``/``_ubound``. Those temporaries should use the PRIVATE
    address space to avoid race conditions in parallel kernels.
    """
    nrows = 5
    nnz = nrows*3

    # Symbolic inputs: the CSR structure arrays and the vector to multiply.
    x = pt.make_placeholder("x", (nrows,), np.float64)
    values = pt.make_placeholder("values", (nnz,), np.float64)
    col_indices = pt.make_placeholder("col_indices", (nnz,), np.int64)
    row_starts = pt.make_placeholder("row_starts", (nrows+1,), np.int64)

    matrix = pt.make_csr_matrix(
        shape=(nrows, nrows),
        elem_values=values,
        elem_col_indices=col_indices,
        row_starts=row_starts,
    )

    kernel = pt.generate_loopy(matrix @ x).kernel

    # Pick out the temporaries that hold reduction bounds, identified by
    # their name suffix.
    bound_temps = {
        name: tv
        for name, tv in kernel.temporary_variables.items()
        if name.endswith(("_lbound", "_ubound"))
    }

    for name, tv in bound_temps.items():
        assert tv.address_space == lp.AddressSpace.PRIVATE, (
            f"Reduction bound temporary '{name}' should be PRIVATE, "
            f"got {tv.address_space}")

    # Guard against the test passing vacuously if no bound temporaries
    # were generated at all.
    assert bound_temps, (
        "Expected at least one reduction bound temporary whose name ends in "
        "'_lbound' or '_ubound', but none were found.")


@pytest.mark.parametrize("dtype", (np.float32, np.float64, np.complex128))
@pytest.mark.parametrize("function_name", ("abs", "sin", "cos", "tan", "arcsin",
"arccos", "arctan", "sinh", "cosh", "tanh", "exp", "log", "log10", "sqrt",
Expand Down
Loading