Skip to content

Commit

Permalink
Fix CUDA local temp var allocation with base storage
Browse files Browse the repository at this point in the history
  • Loading branch information
kaushikcfd authored and inducer committed Aug 2, 2024
1 parent c73db67 commit ad3618f
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 29 deletions.
69 changes: 41 additions & 28 deletions loopy/target/c/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
Const,
Declarator,
Generable,
Initializer,
NestedDeclarator,
Pointer,
)
Expand Down Expand Up @@ -800,7 +801,7 @@ def get_function_definition(
kernel = codegen_state.kernel
assert kernel.linearization is not None

from cgen import FunctionBody, Initializer, Line
from cgen import FunctionBody, Line

result = []

Expand Down Expand Up @@ -883,6 +884,42 @@ def get_kernel_call(self, codegen_state: CodeGenerationState,
lsize: Tuple[ExpressionT, ...]) -> Optional[Generable]:
return None

def emit_temp_var_decl_for_tv_with_base_storage(self,
        codegen_state: CodeGenerationState,
        tv: TemporaryVariable) -> Generable:
    """
    Build the declaration of a :class:`loopy.TemporaryVariable` that lives
    in a user-provided :attr:`loopy.TemporaryVariable.base_storage`.

    The result declares a pointer named after *tv* and initializes it with
    ``base_storage + offset``, cast to the matching pointer type. Both the
    declared pointer and the cast type are passed through
    :meth:`wrap_decl_for_address_space` using the address space of *tv*.

    :arg codegen_state: supplies the expression-to-code mapper that is used
        to render ``tv.offset``.
    :arg tv: the temporary variable; its ``base_storage`` must not be
        *None* and its ``address_space`` must be an :class:`AddressSpace`.
    :returns: an :class:`Initializer` for the pointer to the temporary.
    """
    assert tv.base_storage is not None
    assert isinstance(tv.address_space, AddressSpace)
    expr_to_code = codegen_state.expression_to_code_mapper

    # Picking the 'restrict' flavor is a complete lie--of course all
    # temporaries sharing a base storage alias each other. The promise
    # being made is merely that they are not used to shovel data from one
    # representation to the other.
    make_ptr = (
        _ConstPointer
        if tv._base_storage_access_may_be_aliasing
        else _ConstRestrictPointer)

    def pointer_decl(name):
        # Pointer to tv.dtype called *name*, wrapped for tv's address space.
        return self.wrap_decl_for_address_space(
            make_ptr(POD(self, tv.dtype, name)), tv.address_space)

    # An empty name yields the bare pointer type used in the cast.
    cast_specs, cast_declarator = pointer_decl("").get_decl_pair()
    declared_ptr = pointer_decl(tv.name)

    cast_type = " ".join(cast_specs)
    offset_code = expr_to_code(tv.offset)
    return Initializer(
        declared_ptr,
        f"({cast_type} {cast_declarator}) "
        f"({tv.base_storage} + {offset_code})",
    )

def get_temporary_decls(self, codegen_state, schedule_index):
from loopy.kernel.data import AddressSpace

Expand Down Expand Up @@ -930,33 +967,9 @@ def get_temporary_decls(self, codegen_state, schedule_index):

else:
assert tv.initializer is None

cast_decl = POD(self, tv.dtype, "")
temp_var_decl = POD(self, tv.dtype, tv.name)

if tv._base_storage_access_may_be_aliasing:
ptrtype = _ConstPointer
else:
# The 'restrict' part of this is a complete lie--of course
# all these temporaries are aliased. But we're promising to
# not use them to shovel data from one representation to the
# other. That counts, right?
ptrtype = _ConstRestrictPointer

cast_decl = self.wrap_decl_for_address_space(
ptrtype(cast_decl), tv.address_space)
temp_var_decl = self.wrap_decl_for_address_space(
ptrtype(temp_var_decl), tv.address_space)

cast_tp, cast_d = cast_decl.get_decl_pair()
temp_var_decl = Initializer(
temp_var_decl,
"({} {}) ({} + {})".format(
" ".join(cast_tp), cast_d,
tv.base_storage,
ecm(tv.offset)
))

temp_var_decl = self.emit_temp_var_decl_for_tv_with_base_storage(
codegen_state, tv
)
temp_decls_using_base_storage.append(temp_var_decl)

# }}}
Expand Down
41 changes: 40 additions & 1 deletion loopy/target/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,13 @@
from loopy.codegen.result import CodeGenerationResult
from loopy.diagnostic import LoopyError, LoopyTypeError
from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag
from loopy.kernel.data import AddressSpace, ArrayArg, ConstantArg, ImageArg
from loopy.kernel.data import (
AddressSpace,
ArrayArg,
ConstantArg,
ImageArg,
TemporaryVariable,
)
from loopy.kernel.function_interface import ScalarCallable
from loopy.target.c import CFamilyASTBuilder, CFamilyTarget
from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
Expand Down Expand Up @@ -462,6 +468,39 @@ def get_image_arg_declarator(
self, arg: ImageArg, is_written: bool) -> Declarator:
raise NotImplementedError("not yet: texture arguments in CUDA")

def emit_temp_var_decl_for_tv_with_base_storage(self,
        codegen_state: CodeGenerationState,
        tv: TemporaryVariable) -> Generable:
    """
    Build the declaration of a :class:`loopy.TemporaryVariable` that lives
    in a user-provided :attr:`loopy.TemporaryVariable.base_storage`.

    The result declares a pointer named after *tv* and initializes it with
    ``base_storage + offset``, cast to the matching pointer type. The
    pointer declarators are used as-is here: no address-space wrapping is
    applied to either the declared pointer or the cast type.

    :arg codegen_state: supplies the expression-to-code mapper that is used
        to render ``tv.offset``.
    :arg tv: the temporary variable; its ``base_storage`` must not be
        *None*.
    :returns: a :class:`cgen.Initializer` for the pointer to the temporary.
    """
    from cgen import Initializer

    from loopy.target.c import POD, _ConstPointer, _ConstRestrictPointer

    assert tv.base_storage is not None
    expr_to_code = codegen_state.expression_to_code_mapper

    if tv._base_storage_access_may_be_aliasing:
        make_ptr = _ConstPointer
    else:
        # Picking the 'restrict' flavor is a complete lie--of course all
        # temporaries sharing a base storage alias each other. The promise
        # being made is merely that they are not used to shovel data from
        # one representation to the other.
        make_ptr = _ConstRestrictPointer

    # The unnamed pointer only provides the type for the cast expression;
    # the named one is what actually gets declared.
    unnamed_ptr = make_ptr(POD(self, tv.dtype, ""))
    named_ptr = make_ptr(POD(self, tv.dtype, tv.name))

    cast_specs, cast_declarator = unnamed_ptr.get_decl_pair()
    cast_type = " ".join(cast_specs)
    offset_code = expr_to_code(tv.offset)

    init_expr = (
        f"({cast_type} {cast_declarator}) "
        f"({tv.base_storage} + {offset_code})"
    )
    return Initializer(named_ptr, init_expr)

# }}}

# {{{ atomics
Expand Down

0 comments on commit ad3618f

Please sign in to comment.