__copyright__ = "Copyright (C) 2021 Kaushik Kulkarni"

__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

import sys
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.clmath  # noqa
import pyopencl.clrandom  # noqa

import logging
logger = logging.getLogger(__name__)

try:
    import faulthandler
except ImportError:
    pass
else:
    faulthandler.enable()

from pyopencl.tools import pytest_generate_tests_for_pyopencl \
        as pytest_generate_tests

from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa

__all__ = [
        "pytest_generate_tests",
        "cl"  # "cl.create_some_context"
        ]


def test_loop_fusion_vanilla(ctx_factory):
    """Fuse 'j0' and 'j1', which are used by different instructions.

    Expects the iname count to drop from 4 to 3 and 'write_b'/'write_c' to
    share exactly one iname afterwards.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i0, i1, j0, j1]: 0 <= i0, i1, j0, j1 < 10}",
        """
        a[i0] = 1
        b[i1, j0] = 2 {id=write_b}
        c[j1] = 3 {id=write_c}
        """)
    ref_knl = knl

    fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(
        knl["loopy_kernel"], frozenset(["j0", "j1"]))

    knl = knl.with_kernel(
        lp.rename_inames_in_batch(knl["loopy_kernel"], fused_chunks))

    assert len(ref_knl["loopy_kernel"].all_inames()) == 4
    assert len(knl["loopy_kernel"].all_inames()) == 3
    assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames
               & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 1

    lp.auto_test_vs_ref(ref_knl, ctx, knl)


def test_loop_fusion_outer_iname_preventing_fusion(ctx_factory):
    """Request fusion of 'j0' and 'j1' where only 'j0' is used next to 'i0'.

    Expects no fusion: the iname count stays at 3 and 'write_b'/'write_c'
    share no inames.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i0, j0, j1]: 0 <= i0, j0, j1 < 10}",
        """
        a[i0] = 1
        b[i0, j0] = 2 {id=write_b}
        c[j1] = 3 {id=write_c}
        """)
    ref_knl = knl

    fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(
        knl["loopy_kernel"], frozenset(["j0", "j1"]))

    knl = knl.with_kernel(
        lp.rename_inames_in_batch(knl["loopy_kernel"], fused_chunks))

    # Check the reference kernel first, then the transformed one (the original
    # asserted on 'knl' twice and never looked at 'ref_knl').
    assert len(ref_knl["loopy_kernel"].all_inames()) == 3
    assert len(knl["loopy_kernel"].all_inames()) == 3
    assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames
               & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0

    lp.auto_test_vs_ref(ref_knl, ctx, knl)


def test_loop_fusion_with_loop_independent_deps(ctx_factory):
    """Fuse 'j0' and 'j1' where the second loop reads a[j1] written as a[j0].

    Expects the two inames to be fused into one.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[j0, j1]: 0 <= j0, j1 < 10}",
        """
        a[j0] = 1
        b[j1] = 2 * a[j1]
        """, seq_dependencies=True)

    ref_knl = knl

    fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(
        knl["loopy_kernel"], frozenset(["j0", "j1"]))

    knl = knl.with_kernel(
        lp.rename_inames_in_batch(knl["loopy_kernel"], fused_chunks))

    assert len(ref_knl["loopy_kernel"].all_inames()) == 2
    assert len(knl["loopy_kernel"].all_inames()) == 1

    lp.auto_test_vs_ref(ref_knl, ctx, knl)


def test_loop_fusion_constrained_by_outer_loop_deps(ctx_factory):
    """Request fusion of 'j0' and 'j1' with a loop-free write in between.

    With sequential dependencies, the 'write_b' instruction (which uses no
    inames) sits between the two loops. Expects no fusion: the iname count
    stays at 2 and 'write_a'/'write_c' share no inames.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[j0, j1]: 0 <= j0, j1 < 10}",
        """
        a[j0] = 1 {id=write_a}
        b = 2 {id=write_b}
        c[j1] = 2 * a[j1] {id=write_c}
        """, seq_dependencies=True)

    ref_knl = knl

    fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(
        knl["loopy_kernel"], frozenset(["j0", "j1"]))

    knl = knl.with_kernel(
        lp.rename_inames_in_batch(knl["loopy_kernel"], fused_chunks))

    assert len(ref_knl["loopy_kernel"].all_inames()) == 2
    assert len(knl["loopy_kernel"].all_inames()) == 2
    assert len(knl["loopy_kernel"].id_to_insn["write_a"].within_inames
               & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0

    lp.auto_test_vs_ref(ref_knl, ctx, knl)


def test_loop_fusion_with_loop_carried_deps1(ctx_factory):
    """Fuse 'i0' and 'i1' where the second loop writes x[i1-1].

    Expects the two inames to be fused, so that 'first_write' and
    'second_write' share exactly one iname.
    """
    ctx = ctx_factory()
    knl = lp.make_kernel(
        "{[i0, i1]: 1<=i0, i1<10}",
        """
        x[i0] = i0 {id=first_write}
        x[i1-1] = i1 ** 2 {id=second_write}
        """,
        seq_dependencies=True)

    ref_knl = knl

    fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(
        knl["loopy_kernel"], frozenset(["i0", "i1"]))

    knl = knl.with_kernel(
        lp.rename_inames_in_batch(knl["loopy_kernel"], fused_chunks))

    assert len(ref_knl["loopy_kernel"].all_inames()) == 2
    assert len(knl["loopy_kernel"].all_inames()) == 1
    assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames
               & knl["loopy_kernel"].id_to_insn["second_write"].within_inames
               ) == 1

    lp.auto_test_vs_ref(ref_knl, ctx, knl)


def test_loop_fusion_with_loop_carried_deps2(ctx_factory):
    """Request fusion of 'i0' and 'i1' where the first loop writes x[i0-1].

    Mirror image of the 'deps1' case. Expects no fusion: the iname count stays
    at 2 and 'first_write'/'second_write' share no inames.
    """
    ctx = ctx_factory()
    knl = lp.make_kernel(
        "{[i0, i1]: 1<=i0, i1<10}",
        """
        x[i0-1] = i0 {id=first_write}
        x[i1] = i1 ** 2 {id=second_write}
        """,
        seq_dependencies=True)

    ref_knl = knl

    fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(
        knl["loopy_kernel"], frozenset(["i0", "i1"]))

    knl = knl.with_kernel(
        lp.rename_inames_in_batch(knl["loopy_kernel"], fused_chunks))

    assert len(ref_knl["loopy_kernel"].all_inames()) == 2
    assert len(knl["loopy_kernel"].all_inames()) == 2
    assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames
               & knl["loopy_kernel"].id_to_insn["second_write"].within_inames
               ) == 0

    lp.auto_test_vs_ref(ref_knl, ctx, knl)


def test_loop_fusion_with_indirection(ctx_factory):
    """Request fusion of 'i0' and 'i1' where the second write is x[map[i1]].

    Expects no fusion. Both kernels are then run on the same random
    permutation passed as 'map' and their outputs are compared.
    """
    ctx = ctx_factory()
    map_ = np.random.permutation(10)
    cq = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i0, i1]: 0<=i0, i1<10}",
        """
        x[i0] = i0 {id=first_write}
        x[map[i1]] = i1 ** 2 {id=second_write}
        """,
        seq_dependencies=True)

    ref_knl = knl

    fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(
        knl["loopy_kernel"], frozenset(["i0", "i1"]))

    knl = knl.with_kernel(
        lp.rename_inames_in_batch(knl["loopy_kernel"], fused_chunks))

    assert len(ref_knl["loopy_kernel"].all_inames()) == 2
    assert len(knl["loopy_kernel"].all_inames()) == 2
    assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames
               & knl["loopy_kernel"].id_to_insn["second_write"].within_inames
               ) == 0

    _, (out1,) = ref_knl(cq, map=map_)
    _, (out2,) = knl(cq, map=map_)
    np.testing.assert_allclose(out1, out2)


def test_loop_fusion_with_induced_dependencies_from_sibling_nests(ctx_factory):
    """Request fusion of 'i0' and 'i1' with a 'j' loop between them.

    There are no structural assertions; the test only checks that the
    transformed kernel still runs and matches the reference.
    """
    ctx = ctx_factory()
    t_unit = lp.make_kernel(
        "{[i0, j, i1, i2]: 0<=i0, j, i1, i2<10}",
        """
        <> tmp0[i0] = i0
        <> tmp1[j] = tmp0[j]
        <> tmp2[j] = j
        out1[i1] = tmp2[i1]
        out2[i2] = 2 * tmp1[i2]
        """)
    ref_t_unit = t_unit
    knl = t_unit.default_entrypoint
    knl = lp.rename_inames_in_batch(
        knl,
        lp.get_kennedy_unweighted_fusion_candidates(
            knl, frozenset(["i0", "i1"])))
    t_unit = t_unit.with_kernel(knl)

    # 'i1', 'i2' should not be fused. If fused that would lead to an
    # unschedulable kernel. Making sure that the kernel 'runs' suffices that
    # the transformation was successful.
    # NOTE(review): the candidates passed above are 'i0', 'i1', not
    # 'i1', 'i2' -- confirm which pair this comment is meant to name.
    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit)


def test_loop_fusion_on_reduction_inames(ctx_factory):
    """Fuse inner reduction inames 'j1' and 'j2', both nested under 'j0'.

    Expects both instructions to end up with the same reduction inames.
    """
    ctx = ctx_factory()

    t_unit = lp.make_kernel(
        "{[i, j0, j1, j2]: 0<=i, j0, j1, j2<10}",
        """
        y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1]))
        y1[i] = sum(j0, sum([j2], 3*A[i, j0, j2]))
        """, [lp.GlobalArg("A",
                           dtype=np.float64,
                           shape=lp.auto), ...])
    ref_t_unit = t_unit
    knl = t_unit.default_entrypoint
    knl = lp.rename_inames_in_batch(
        knl,
        lp.get_kennedy_unweighted_fusion_candidates(
            knl, frozenset(["j1", "j2"])))
    assert (knl.id_to_insn["insn"].reduction_inames()
            == knl.id_to_insn["insn_0"].reduction_inames())

    t_unit = t_unit.with_kernel(knl)
    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit)


def test_loop_fusion_on_reduction_inames_with_depth_mismatch(ctx_factory):
    """Request fusion of inner reduction inames 'j1' and 'j3'.

    Their enclosing reductions use different inames ('j0' vs. 'j2'). Expects
    the two instructions to keep different reduction inames.
    """
    ctx = ctx_factory()

    t_unit = lp.make_kernel(
        "{[i, j0, j1, j2, j3]: 0<=i, j0, j1, j2, j3<10}",
        """
        y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1]))
        y1[i] = sum(j2, sum([j3], 3*A[i, j3, j2]))
        """, [lp.GlobalArg("A",
                           dtype=np.float64,
                           shape=lp.auto),
              ...])
    ref_t_unit = t_unit
    knl = t_unit.default_entrypoint
    knl = lp.rename_inames_in_batch(
        knl,
        lp.get_kennedy_unweighted_fusion_candidates(
            knl, frozenset(["j1", "j3"])))

    # cannot fuse 'j1', 'j3' because they are not nested within the same outer
    # inames.
    assert (knl.id_to_insn["insn"].reduction_inames()
            != knl.id_to_insn["insn_0"].reduction_inames())

    t_unit = t_unit.with_kernel(knl)
    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit)


def test_loop_fusion_on_outer_reduction_inames(ctx_factory):
    """Fuse the outer reduction inames 'j0' and 'j2'.

    Expects the two instructions to share exactly one reduction iname
    afterwards.
    """
    ctx = ctx_factory()

    t_unit = lp.make_kernel(
        "{[i, j0, j1, j2, j3]: 0<=i, j0, j1, j2, j3<10}",
        """
        y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1]))
        y1[i] = sum(j2, sum([j3], 3*A[i, j3, j2]))
        """, [lp.GlobalArg("A",
                           dtype=np.float64,
                           shape=lp.auto),
              ...])
    ref_t_unit = t_unit
    knl = t_unit.default_entrypoint
    knl = lp.rename_inames_in_batch(
        knl,
        lp.get_kennedy_unweighted_fusion_candidates(
            knl, frozenset(["j0", "j2"])))

    assert len(knl.id_to_insn["insn"].reduction_inames()
               & knl.id_to_insn["insn_0"].reduction_inames()) == 1

    t_unit = t_unit.with_kernel(knl)
    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit)


def test_loop_fusion_reduction_inames_simple(ctx_factory):
    """Fuse the reduction inames 'j0' and 'j1' of two single-level sums.

    Expects both instructions to end up with the same reduction inames.
    """
    ctx = ctx_factory()

    t_unit = lp.make_kernel(
        "{[i, j0, j1]: 0<=i, j0, j1<10}",
        """
        y0[i] = sum(j0, 2*A[i, j0])
        y1[i] = sum(j1, 3*A[i, j1])
        """, [lp.GlobalArg("A",
                           dtype=np.float64,
                           shape=lp.auto),
              ...])
    ref_t_unit = t_unit
    knl = t_unit.default_entrypoint
    knl = lp.rename_inames_in_batch(
        knl,
        lp.get_kennedy_unweighted_fusion_candidates(
            knl, frozenset(["j0", "j1"])))

    assert (knl.id_to_insn["insn"].reduction_inames()
            == knl.id_to_insn["insn_0"].reduction_inames())

    t_unit = t_unit.with_kernel(knl)
    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit)


def test_redn_loop_fusion_with_non_candidates_loops_in_nest(ctx_factory):
    """Request fusion of 'j1' and 'j2' sitting at different nesting depths.

    The sum over 'j1' is inside 'for i' / 'for d', the sum over 'j2' only
    inside 'for i'. Expects the instructions to share no reduction inames.
    """
    ctx = ctx_factory()
    t_unit = lp.make_kernel(
        "{[i, j1, j2, d]: 0<=i, j1, j2, d<10}",
        """
        for i
            for d
                out1[i, d] = sum(j1, 2 * j1*i)
            end
            out2[i] = sum(j2, 2 * j2)
        end
        """, seq_dependencies=True)
    ref_t_unit = t_unit

    knl = t_unit.default_entrypoint
    knl = lp.rename_inames_in_batch(
        knl,
        lp.get_kennedy_unweighted_fusion_candidates(
            knl, frozenset(["j1", "j2"])))

    assert not (knl.id_to_insn["insn"].reduction_inames()
                & knl.id_to_insn["insn_0"].reduction_inames())

    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit.with_kernel(knl))


def test_reduction_loop_fusion_with_multiple_redn_in_same_insn(ctx_factory):
    """Fuse 'j1' and 'j2' used by two sums within a single instruction.

    Expects the instruction to have exactly one reduction iname afterwards.
    """
    ctx = ctx_factory()
    t_unit = lp.make_kernel(
        "{[j1, j2]: 0<=j1, j2<10}",
        """
        out = sum(j1, 2*j1) + sum(j2, 2*j2)
        """, seq_dependencies=True)
    ref_t_unit = t_unit

    knl = t_unit.default_entrypoint
    knl = lp.rename_inames_in_batch(
        knl,
        lp.get_kennedy_unweighted_fusion_candidates(
            knl, frozenset(["j1", "j2"])))

    assert len(knl.id_to_insn["insn"].reduction_inames()) == 1

    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit.with_kernel(knl))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        # NOTE(review): executes the first command-line argument as Python
        # code. Presumably a developer convenience for running one test by
        # hand; never feed it untrusted input.
        exec(sys.argv[1])
    else:
        from pytest import main
        main([__file__])

# vim: fdm=marker