diff --git a/.gitignore b/.gitignore
index b1c8ed26e9..ebbbfaebeb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ _local
 /src/__init__.py
 /tests/__init__.py
 .gt_cache/
+.gt4py_cache/
 .gt_cache_pytest*/
 
 # DaCe
diff --git a/src/gt4py/cartesian/gtc/common.py b/src/gt4py/cartesian/gtc/common.py
index ef38a9a658..60236a3e97 100644
--- a/src/gt4py/cartesian/gtc/common.py
+++ b/src/gt4py/cartesian/gtc/common.py
@@ -118,7 +118,7 @@ def isbool(self):
         return self == self.BOOL
 
     def isinteger(self):
-        return self in (self.INT8, self.INT32, self.INT64)
+        return self in (self.INT8, self.INT16, self.INT32, self.INT64)
 
     def isfloat(self):
         return self in (self.FLOAT32, self.FLOAT64)
diff --git a/src/gt4py/cartesian/gtc/dace/daceir.py b/src/gt4py/cartesian/gtc/dace/daceir.py
index 492a9598c5..43a33fdd6d 100644
--- a/src/gt4py/cartesian/gtc/dace/daceir.py
+++ b/src/gt4py/cartesian/gtc/dace/daceir.py
@@ -734,7 +734,13 @@ class ScalarAccess(common.ScalarAccess, Expr):
 
 
 class VariableKOffset(common.VariableKOffset[Expr]):
-    pass
+    @datamodels.validator("k")
+    def no_casts_in_offset_expression(self, _: datamodels.Attribute, expression: Expr) -> None:
+        for part in expression.walk_values():
+            if isinstance(part, Cast):
+                raise ValueError(
+                    "DaCe backends are currently missing support for casts in variable k offsets. See issue https://github.com/GridTools/gt4py/issues/1881."
+                )
 
 
 class IndexAccess(common.FieldAccess, Expr):
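The validator above rejects `Cast` nodes anywhere inside a variable k offset expression. For intuition on where such casts come from, here is a small standalone NumPy sketch (illustrative only, not part of the patch): mixing integer widths in an offset expression triggers an implicit promotion, which the frontend represents as a `Cast` node.

```python
import numpy as np

# Mixed-width integer arithmetic promotes the narrower operand.
i32 = np.int32(1)
i64 = np.int64(2)

# The result is int64: the int32 operand is implicitly cast before the
# subtraction. The same promotion places a Cast node inside a variable
# k offset such as `in_field[0, 0, i32 - i64]`, which the validator
# above now rejects for DaCe backends.
assert (i32 - i64).dtype == np.int64
```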
diff --git a/src/gt4py/cartesian/gtc/dace/expansion/tasklet_codegen.py b/src/gt4py/cartesian/gtc/dace/expansion/tasklet_codegen.py
index 8033c64710..2948b9d76d 100644
--- a/src/gt4py/cartesian/gtc/dace/expansion/tasklet_codegen.py
+++ b/src/gt4py/cartesian/gtc/dace/expansion/tasklet_codegen.py
@@ -44,7 +44,9 @@ def _visit_offset(
         else:
            int_sizes.append(None)
        sym_offsets = [
-            dace.symbolic.pystr_to_symbolic(self.visit(off, **kwargs))
+            dace.symbolic.pystr_to_symbolic(
+                self.visit(off, access_info=access_info, decl=decl, **kwargs)
+            )
            for off in (node.to_dict()["i"], node.to_dict()["j"], node.k)
        ]
        for axis in access_info.variable_offset_axes:
diff --git a/src/gt4py/next/program_processors/runners/dace/gtir_dataflow.py b/src/gt4py/next/program_processors/runners/dace/gtir_dataflow.py
index e6f33208e3..43e7c6354d 100644
--- a/src/gt4py/next/program_processors/runners/dace/gtir_dataflow.py
+++ b/src/gt4py/next/program_processors/runners/dace/gtir_dataflow.py
@@ -232,15 +232,36 @@ def connect(
         dest: dace.nodes.AccessNode,
         subset: dace_subsets.Range,
     ) -> None:
-        # retrieve the node which writes the result
-        last_node = self.state.in_edges(self.result.dc_node)[0].src
-        if isinstance(last_node, dace.nodes.Tasklet):
-            # the last transient node can be deleted
-            # Note that it could also be applied when `last_node` is a NestedSDFG,
-            # but an exception would be when the inner write to global data is a
-            # WCR memlet, because that prevents fusion of the outer map. This case
-            # happens for the reduce with skip values, which uses a map with WCR.
-            last_node_connector = self.state.in_edges(self.result.dc_node)[0].src_conn
+        write_edge = self.state.in_edges(self.result.dc_node)[0]
+        write_size = write_edge.data.dst_subset.num_elements()
+        # check the kind of node which writes the result
+        if isinstance(write_edge.src, dace.nodes.Tasklet):
+            # The temporary data written by a tasklet can be safely deleted.
+            assert write_size.is_constant()
+            remove_last_node = True
+        elif isinstance(write_edge.src, dace.nodes.NestedSDFG):
+            if write_size.is_constant():
+                # Temporary data with compile-time size is allocated on the stack
+                # and is therefore safe to keep. We decide to keep it as a workaround
+                # for a dace issue with memlet propagation in combination with
+                # nested SDFGs containing conditional blocks. The output memlet
+                # of such blocks will be marked as dynamic because dace is not able
+                # to detect the exact size of a conditional branch dataflow, even
+                # for if-else expressions with exactly the same output data.
+                remove_last_node = False
+            else:
+                # If the output data has runtime size, it must be removed in order
+                # to avoid dynamic memory allocation inside a parallel map scope.
+                # Such allocation would degrade performance and can eventually
+                # cause illegal memory accesses when the GPU runs out of local
+                # memory.
+                remove_last_node = True
+        else:
+            remove_last_node = False
+
+        if remove_last_node:
+            last_node = write_edge.src
+            last_node_connector = write_edge.src_conn
             self.state.remove_node(self.result.dc_node)
         else:
             last_node = self.result.dc_node
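The branch on `write_size.is_constant()` above distinguishes compile-time from runtime-sized writes. Sizes obtained from dace memlet subsets are sympy expressions, so a minimal sympy sketch of the distinction (the symbol `N` is purely illustrative) looks like this:

```python
import sympy

# A write covering a fixed 8x4 block: the element count is a
# compile-time constant, so the temporary can be kept (it is
# allocated on the stack inside the map scope).
tile_size = sympy.Integer(8) * sympy.Integer(4)
assert tile_size.is_constant()

# A write whose extent depends on a runtime symbol: the element count
# is not constant, so the temporary is removed to avoid dynamic
# allocation inside the parallel map scope.
N = sympy.Symbol("N", positive=True)
runtime_size = N * sympy.Integer(4)
assert not runtime_size.is_constant()
```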
diff --git a/tests/cartesian_tests/definitions.py b/tests/cartesian_tests/definitions.py
index 7499ad4a95..4d52b9b773 100644
--- a/tests/cartesian_tests/definitions.py
+++ b/tests/cartesian_tests/definitions.py
@@ -51,6 +51,12 @@ def _get_backends_with_storage_info(storage_info_kind: str):
 _PERFORMANCE_BACKEND_NAMES = [name for name in _ALL_BACKEND_NAMES if name not in ("numpy", "cuda")]
 PERFORMANCE_BACKENDS = [_backend_name_as_param(name) for name in _PERFORMANCE_BACKEND_NAMES]
+DACE_BACKENDS = [
+    _backend_name_as_param(name)
+    for name in filter(lambda name: name.startswith("dace:"), _ALL_BACKEND_NAMES)
+]
+NON_DACE_BACKENDS = [backend for backend in ALL_BACKENDS if backend not in DACE_BACKENDS]
+
 
 
 @pytest.fixture()
 def id_version():
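As a quick illustration of the partition these two lists create (the backend names below are examples; the real values come from `_ALL_BACKEND_NAMES`):

```python
# Hypothetical subset of backend names, for illustration only.
names = ["numpy", "gt:cpu_ifirst", "gt:gpu", "dace:cpu", "dace:gpu"]

dace_names = [n for n in names if n.startswith("dace:")]
non_dace_names = [n for n in names if not n.startswith("dace:")]

assert dace_names == ["dace:cpu", "dace:gpu"]
assert non_dace_names == ["numpy", "gt:cpu_ifirst", "gt:gpu"]
```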
diff --git a/tests/cartesian_tests/integration_tests/multi_feature_tests/test_code_generation.py b/tests/cartesian_tests/integration_tests/multi_feature_tests/test_code_generation.py
index 8ace0de740..8e5f3466d0 100644
--- a/tests/cartesian_tests/integration_tests/multi_feature_tests/test_code_generation.py
+++ b/tests/cartesian_tests/integration_tests/multi_feature_tests/test_code_generation.py
@@ -27,7 +27,13 @@
 )
 from gt4py.storage.cartesian import utils as storage_utils
-from cartesian_tests.definitions import ALL_BACKENDS, CPU_BACKENDS, get_array_library
+from cartesian_tests.definitions import (
+    ALL_BACKENDS,
+    CPU_BACKENDS,
+    DACE_BACKENDS,
+    NON_DACE_BACKENDS,
+    get_array_library,
+)
 from cartesian_tests.integration_tests.multi_feature_tests.stencil_definitions import (
     EXTERNALS_REGISTRY as externals_registry,
     REGISTRY as stencil_definitions,
 )
@@ -762,3 +768,89 @@ def test(
     out_arr = gt_storage.ones(backend=backend, shape=domain, dtype=np.float64)
     test(in_arr, out_arr)
     assert (out_arr[:, :, :] == 388.0).all()
+
+
+@pytest.mark.parametrize("backend", NON_DACE_BACKENDS)
+def test_cast_in_index(backend):
+    @gtscript.stencil(backend)
+    def cast_in_index(
+        in_field: Field[np.float64], i32: np.int32, i64: np.int64, out_field: Field[np.float64]
+    ):
+        """Simple copy stencil with forced cast in index calculation."""
+        with computation(PARALLEL), interval(...):
+            out_field = in_field[0, 0, i32 - i64]
+
+
+@pytest.mark.parametrize("backend", DACE_BACKENDS)
+@pytest.mark.xfail(raises=ValueError)
+def test_dace_no_cast_in_index(backend):
+    @gtscript.stencil(backend)
+    def cast_in_index(
+        in_field: Field[np.float64], i32: np.int32, i64: np.int64, out_field: Field[np.float64]
+    ):
+        """Simple copy stencil with forced cast in index calculation."""
+        with computation(PARALLEL), interval(...):
+            out_field = in_field[0, 0, i32 - i64]
+
+
+@pytest.mark.parametrize("backend", ALL_BACKENDS)
+def test_read_after_write_stencil(backend):
+    """Stencil with multiple read after write access patterns."""
+
+    @gtscript.stencil(backend=backend)
+    def lagrangian_contributions(
+        q: Field[np.float64],
+        pe1: Field[np.float64],
+        pe2: Field[np.float64],
+        q4_1: Field[np.float64],
+        q4_2: Field[np.float64],
+        q4_3: Field[np.float64],
+        q4_4: Field[np.float64],
+        dp1: Field[np.float64],
+        lev: gtscript.Field[gtscript.IJ, np.int64],
+    ):
+        """
+        Args:
+            q (out):
+            pe1 (in):
+            pe2 (in):
+            q4_1 (in):
+            q4_2 (in):
+            q4_3 (in):
+            q4_4 (in):
+            dp1 (in):
+            lev (inout):
+        """
+        with computation(FORWARD), interval(...):
+            pl = (pe2 - pe1[0, 0, lev]) / dp1[0, 0, lev]
+            if pe2[0, 0, 1] <= pe1[0, 0, lev + 1]:
+                pr = (pe2[0, 0, 1] - pe1[0, 0, lev]) / dp1[0, 0, lev]
+                q = (
+                    q4_2[0, 0, lev]
+                    + 0.5 * (q4_4[0, 0, lev] + q4_3[0, 0, lev] - q4_2[0, 0, lev]) * (pr + pl)
+                    - q4_4[0, 0, lev] * 1.0 / 3.0 * (pr * (pr + pl) + pl * pl)
+                )
+            else:
+                qsum = (pe1[0, 0, lev + 1] - pe2) * (
+                    q4_2[0, 0, lev]
+                    + 0.5 * (q4_4[0, 0, lev] + q4_3[0, 0, lev] - q4_2[0, 0, lev]) * (1.0 + pl)
+                    - q4_4[0, 0, lev] * 1.0 / 3.0 * (1.0 + pl * (1.0 + pl))
+                )
+                lev = lev + 1
+                while pe1[0, 0, lev + 1] < pe2[0, 0, 1]:
+                    qsum += dp1[0, 0, lev] * q4_1[0, 0, lev]
+                    lev = lev + 1
+                dp = pe2[0, 0, 1] - pe1[0, 0, lev]
+                esl = dp / dp1[0, 0, lev]
+                qsum += dp * (
+                    q4_2[0, 0, lev]
+                    + 0.5
+                    * esl
+                    * (
+                        q4_3[0, 0, lev]
+                        - q4_2[0, 0, lev]
+                        + q4_4[0, 0, lev] * (1.0 - (2.0 / 3.0) * esl)
+                    )
+                )
+                q = qsum / (pe2[0, 0, 1] - pe2)
+                lev = lev - 1
diff --git a/tests/cartesian_tests/unit_tests/test_gtc/test_common.py b/tests/cartesian_tests/unit_tests/test_gtc/test_common.py
index 68006c113b..4e799d2090 100644
--- a/tests/cartesian_tests/unit_tests/test_gtc/test_common.py
+++ b/tests/cartesian_tests/unit_tests/test_gtc/test_common.py
@@ -41,6 +41,24 @@
 # - For testing non-leave nodes, introduce builders with defaults (for leave nodes as well)
 
 
+def test_data_type_methods():
+    for type in DataType:
+        if type == DataType.BOOL:
+            assert type.isbool()
+        else:
+            assert not type.isbool()
+
+        if type in (DataType.INT8, DataType.INT16, DataType.INT32, DataType.INT64):
+            assert type.isinteger()
+        else:
+            assert not type.isinteger()
+
+        if type in (DataType.FLOAT32, DataType.FLOAT64):
+            assert type.isfloat()
+        else:
+            assert not type.isfloat()
+
+
 class DummyExpr(Expr):
     """Fake expression for cases where a concrete expression is not needed."""
 
diff --git a/tests/next_tests/unit_tests/program_processor_tests/runners_tests/dace_tests/test_gtir_to_sdfg.py b/tests/next_tests/unit_tests/program_processor_tests/runners_tests/dace_tests/test_gtir_to_sdfg.py
index 8ebb240339..030aa9b131 100644
--- a/tests/next_tests/unit_tests/program_processor_tests/runners_tests/dace_tests/test_gtir_to_sdfg.py
+++ b/tests/next_tests/unit_tests/program_processor_tests/runners_tests/dace_tests/test_gtir_to_sdfg.py
@@ -1846,7 +1846,7 @@ def test_gtir_let_lambda_with_tuple1():
 
     sdfg = build_dace_sdfg(testee, CARTESIAN_OFFSETS)
 
-    z_fields = (np.empty_like(a), np.empty_like(a))
+    z_fields = (np.zeros_like(a), np.zeros_like(a))
     a_ref = np.concatenate((z_fields[0][:1], a[1 : N - 1], z_fields[0][N - 1 :]))
     b_ref = np.concatenate((z_fields[1][:1], b[1 : N - 1], z_fields[1][N - 1 :]))
 
@@ -2037,7 +2037,7 @@ def test_gtir_index():
         ],
     )
 
-    v = np.empty(N, dtype=np.int32)
+    v = np.zeros(N, dtype=np.int32)
 
     # we need to run domain inference in order to add the domain annex information to the index node.
     testee = infer_domain.infer_program(testee, offset_provider=CARTESIAN_OFFSETS)
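The two `np.empty*` to `np.zeros*` changes above make the test arrays deterministic: `np.empty` and `np.empty_like` return uninitialized memory whose contents are arbitrary (possibly even NaN, which compares unequal to itself), so comparisons involving elements the SDFG never writes can fail intermittently. A minimal illustration:

```python
import numpy as np

a = np.empty(4)  # uninitialized: contents are arbitrary, possibly NaN
b = np.zeros(4)  # deterministic: always all zeros

assert (b == 0).all()  # always holds
# (a == a).all() is NOT guaranteed: if `a` happens to contain a NaN,
# the elementwise comparison yields False at that position.
```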