diff --git a/test/test_loop_fusion.py b/test/test_loop_fusion.py new file mode 100644 index 000000000..1d39867ea --- /dev/null +++ b/test/test_loop_fusion.py @@ -0,0 +1,272 @@ +__copyright__ = "Copyright (C) 2021 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa + +__all__ = [ + "pytest_generate_tests", + "cl" # "cl.create_some_context" + ] + + +def test_loop_fusion_vanilla(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i0, i1, j0, j1]: 0 <= i0, i1, j0, j1 < 10}", + """ + a[i0] = 1 + b[i1, j0] = 2 {id=write_b} + c[j1] = 3 {id=write_c} + """) + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + assert len(ref_knl["loopy_kernel"].all_inames()) == 4 + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_outer_iname_preventing_fusion(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i0, j0, j1]: 0 <= i0, j0, j1 < 10}", + """ + a[i0] = 1 + b[i0, j0] = 2 {id=write_b} + c[j1] = 3 {id=write_c} + """) + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_independent_deps(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[j0, j1]: 0 <= j0, j1 < 10}", + """ + a[j0] = 1 + b[j1] = 2 * a[j1] + """, seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_constrained_by_outer_loop_deps(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[j0, j1]: 0 <= j0, j1 < 10}", + """ + a[j0] = 1 {id=write_a} + b = 2 {id=write_b} + c[j1] = 2 * a[j1] {id=write_c} + """, seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["write_a"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_carried_deps1(ctx_factory): + + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i0, i1]: 1<=i0, i1<10}", + """ + x[i0] = i0 {id=first_write} + x[i1-1] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 1 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_carried_deps2(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i0, i1]: 1<=i0, i1<10}", + """ + x[i0-1] = i0 {id=first_write} + x[i1] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_indirection(ctx_factory): + ctx = ctx_factory() + map_ = np.random.permutation(10) + cq = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i0, i1]: 0<=i0, i1<10}", + """ + x[i0] = i0 {id=first_write} + x[map[i1]] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 0 + + _, (out1,) = ref_knl(cq, map=map_) + _, (out2,) = knl(cq, map=map_) + np.testing.assert_allclose(out1, out2) + + +def test_loop_fusion_with_induced_dependencies_from_sibling_nests(ctx_factory): + ctx = ctx_factory() + t_unit = lp.make_kernel( + "{[i0, j, i1, i2]: 0<=i0, j, i1, i2<10}", + """ + <> tmp0[i0] = i0 + <> tmp1[j] = tmp0[j] + <> tmp2[j] = j + out1[i1] = tmp2[i1] + out2[i2] = 2 * tmp1[i2] + """) + ref_t_unit = t_unit + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, frozenset(["i0", "i1"]))) + t_unit = t_unit.with_kernel(knl) + + # 'i1', 'i2' should not be fused. If fused that would lead to an + # unshcedulable kernel. Making sure that the kernel 'runs' suffices that + # the transformation was successful. + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker