From 71f67383b3d74b62da99ef0f737853ddb44f0fd7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 31 Oct 2021 16:54:49 -0500 Subject: [PATCH 1/3] Add a utility to get the instruction access map --- loopy/kernel/tools.py | 63 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 9a14aedd5..b3f39b4d8 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -2151,4 +2151,67 @@ def get_hw_axis_base_for_codegen(kernel: LoopKernel, iname: str) -> isl.Aff: constants_only=False) return lower_bound + +# {{{ get access map from an instruction + +class _IndexCollector(CombineMapper): + def __init__(self, var): + self.var = var + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_subscript(self, expr): + if expr.aggregate.name == self.var: + return (super().map_subscript(expr) | frozenset([expr.index_tuple])) + else: + return super().map_subscript(expr) + + def map_algebraic_leaf(self, expr): + return frozenset() + + map_constant = map_algebraic_leaf + + +def _project_out_inames_from_maps(amaps, inames_to_project_out): + new_amaps = [] + for amap in amaps: + for iname in inames_to_project_out: + dt, pos = amap.get_var_dict()[iname] + amap = amap.project_out(dt, pos, 1) + + new_amaps.append(amap) + + return new_amaps + + +def _union_amaps(amaps): + import islpy as isl + return reduce(isl.Map.union, amaps[1:], amaps[0]) + + +def get_insn_access_map(kernel, insn_id, var, inner_inames): + from loopy.match import Id + from loopy.symbolic import get_access_map + from loopy.transform.subst import expand_subst + + insn = kernel.id_to_insn[insn_id] + + kernel = expand_subst(kernel, within=Id(insn_id)) + indices = list(_IndexCollector(var)((insn.expression, + insn.assignees, + list(insn.predicates)))) + + amaps = _project_out_inames_from_maps( + [get_access_map(kernel.get_inames_domain(insn.within_inames), + idx, 
kernel.assumptions) + + for idx in indices], + inner_inames) + + return _union_amaps(amaps) + +# }}} + # vim: foldmethod=marker From def6bb13b1d72022d1d955d69b73aec85c299c04 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 31 Oct 2021 16:55:30 -0500 Subject: [PATCH 2/3] Implement loop fusion transformation --- doc/ref_transform.rst | 6 + loopy/__init__.py | 6 + loopy/transform/loop_fusion.py | 643 +++++++++++++++++++++++++++++++++ 3 files changed, 655 insertions(+) create mode 100644 loopy/transform/loop_fusion.py diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 9ef012d66..3c209db9e 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -143,4 +143,10 @@ TODO: Matching instruction tags .. automodule:: loopy.match + +Fusing Loops +------------ + +.. automodule:: loopy.transform.loop_fusion + .. vim: tw=75:spell diff --git a/loopy/__init__.py b/loopy/__init__.py index 07f06a021..8eca3e6da 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -180,6 +180,10 @@ simplify_indices, tag_instructions, ) +from loopy.transform.loop_fusion import ( + get_kennedy_unweighted_fusion_candidates, + rename_inames_in_batch, +) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call from loopy.transform.padding import ( add_padding, @@ -325,6 +329,7 @@ "get_dot_dependency_graph", "get_global_barrier_order", "get_iname_duplication_options", + "get_kennedy_unweighted_fusion_candidates", "get_mem_access_map", "get_one_linearized_kernel", "get_one_scheduled_kernel", @@ -371,6 +376,7 @@ "rename_callable", "rename_iname", "rename_inames", + "rename_inames_in_batch", "replace_instruction_ids", "save_and_reload_temporaries", "set_argument_order", diff --git a/loopy/transform/loop_fusion.py b/loopy/transform/loop_fusion.py new file mode 100644 index 000000000..a9252cfaa --- /dev/null +++ b/loopy/transform/loop_fusion.py @@ -0,0 +1,643 @@ +__copyright__ = """ +Copyright (C) 2021-24 Kaushik Kulkarni +""" + +__license__ = """ 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from dataclasses import dataclass +from functools import reduce +from typing import Callable, Dict, FrozenSet, Mapping, Set, Tuple + +from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel + + +__doc__ = """ +.. autofunction:: rename_inames_in_batch +.. autofunction:: get_kennedy_unweighted_fusion_candidates +""" + + +# {{{ Loop Dependence graph class + builder + + +@dataclass(frozen=True, eq=True) +class LoopDependenceGraph: + """ + .. attribute:: successors + + A mapping from iname (``i``) to the collection of inames that can be + scheduled only after the loop corresponding to ``i`` has been exited. + + .. attribute:: predecessors + + A mapping from iname (``i``) to the collection of inames that must have + been exited before entering ``i``. + + .. attribute:: is_infusible + + A mapping from the edges in the loop dependence graph to their + fusibility criterion. 
An edge in this mapping is represented by a pair + of inames``(iname_i, iname_j)`` such that the edge ``iname_i -> + iname_j`` is present in the graph. + + .. note:: + + Both :attr:`successors` and :attr:`predecessors` are maintained to + reduce the complexity of graph primitive operations (like remove node, + add edge, etc.). + """ + + successors: Mapping[str, FrozenSet[str]] + predecessors: Mapping[str, FrozenSet[str]] + is_infusible: Mapping[Tuple[str, str], bool] + + @classmethod + def new(cls, successors, is_infusible): + predecessors = {node: set() for node in successors} + for node, succs in successors.items(): + for succ in succs: + predecessors[succ].add(node) + + predecessors = { + node: frozenset(preds) for node, preds in predecessors.items() + } + successors = {node: frozenset(succs) for node, succs in successors.items()} + + return LoopDependenceGraph(successors, predecessors, is_infusible) + + def is_empty(self): + """ + Returns *True* only if the loop dependence graph contains no nodes. + """ + return len(self.successors) == 0 + + def get_loops_with_no_predecessors(self): + return { + loop for loop, preds in self.predecessors.items() if len(preds) == 0 + } + + def remove_nodes(self, nodes_to_remove): + """ + Returns a copy of *self* after removing *nodes_to_remove* in the graph. + This routine adds necessary edges after removing *nodes_to_remove* to + conserve the scheduling constraints present in the graph. + """ + # {{{ Step 1. Remove the nodes + + new_successors = { + node: succs + for node, succs in self.successors.items() + if node not in nodes_to_remove + } + new_predecessors = { + node: preds + for node, preds in self.predecessors.items() + if node not in nodes_to_remove + } + + new_is_infusible = { + (from_, to): v + for (from_, to), v in self.is_infusible.items() + if (from_ not in nodes_to_remove and to not in nodes_to_remove) + } + + # }}} + + # {{{ Step 2. 
Propagate dependencies + + # For every Node 'R' to be removed and every pair (S, P) such that + # 1. there exists an edge 'P' -> 'R' in the original graph, and, + # 2. there exits an edge 'R' -> 'S' in the original graph. + # add the edge 'P' -> 'S' in the new graph. + + for node_to_remove in nodes_to_remove: + for succ in self.successors[node_to_remove] - nodes_to_remove: + new_predecessors[succ] = new_predecessors[succ] - frozenset( + [node_to_remove] + ) + + for pred in self.predecessors[node_to_remove] - nodes_to_remove: + new_successors[pred] = new_successors[pred] - frozenset( + [node_to_remove] + ) + + # }}} + + return LoopDependenceGraph( + new_successors, new_predecessors, new_is_infusible + ) + + +@dataclass +class LoopDependenceGraphBuilder: + _dag: Dict[str, Set[str]] + _is_infusible: Mapping[Tuple[str, str], bool] + + @classmethod + def new(cls, candidates): + return LoopDependenceGraphBuilder( + {iname: set() for iname in candidates}, {} + ) + + def add_edge(self, from_: str, to: str, is_infusible: bool): + self._dag[from_].add(to) + self._is_infusible[(from_, to)] = is_infusible or self._is_infusible.get( + (from_, to), False + ) + + def done(self): + """ + Returns the built :class:`LoopDependenceGraph`. + """ + return LoopDependenceGraph.new(self._dag, self._is_infusible) + + +# }}} + + +def _remove_irrelevant_insns_from_statement_dag( + kernel: LoopKernel, + insn_to_predecessors: Mapping[str, FrozenSet[str]], + insn_to_successors: Mapping[str, FrozenSet[str]], + candidates: FrozenSet[str], +) -> Tuple[ + Mapping[str, FrozenSet[str]], + Mapping[str, FrozenSet[str]], + FrozenSet[Tuple[str, str]], +]: + """ + Removes instructions from the statement DAG represented by + *insn_to_predecessors*, *insn_to_successors* that are not nested in + *candidates*. + + Returns a new statement DAG ``new_predecessors, new_successors`` , where + edges are added between the remaining nodes of the statement DAG to + preserve the dependencies in the original DAG. 
+ """ + # {{{ input validation + + assert set(insn_to_predecessors) == set(insn_to_successors) + assert all(isinstance(val, frozenset) for val in insn_to_predecessors.values()) + assert all(isinstance(val, frozenset) for val in insn_to_successors.values()) + + # }}} + + insns_to_remove = { + insn + for insn in insn_to_successors + if len(kernel.id_to_insn[insn].within_inames & candidates) == 0 + } + + new_predecessors = insn_to_predecessors.copy() + new_successors = insn_to_successors.copy() + infusible_edges_in_statement_dag = set() + + for insn_to_remove in insns_to_remove: + for pred in new_predecessors[insn_to_remove]: + new_successors[pred] = ( + new_successors[pred] - frozenset([insn_to_remove]) + ) | new_successors[insn_to_remove] + + for succ in new_successors[insn_to_remove]: + new_predecessors[succ] = ( + new_predecessors[succ] - frozenset([insn_to_remove]) + ) | new_predecessors[insn_to_remove] + + for pred in new_predecessors[insn_to_remove]: + for succ in new_successors[insn_to_remove]: + # now mark the edge from pred -> succ infusible iff both 'pred' and + # 'succ' are *not* in insns_to_remove + if (pred not in insns_to_remove) and (succ not in insns_to_remove): + infusible_edges_in_statement_dag.add((pred, succ)) + + del new_predecessors[insn_to_remove] + del new_successors[insn_to_remove] + + return ( + new_predecessors, + new_successors, + frozenset(infusible_edges_in_statement_dag), + ) + + +def _compute_isinfusible_via_access_map( + kernel, insn_pred, candidate_pred, insn_succ, candidate_succ, outer_inames, var +): + """ + Returns *True* if the inames *candidate_pred* and *candidate_succ* are fused then + that might lead to a loop carried dependency for *var*. 
+ """ + import islpy as isl + import pymbolic.primitives as prim + + from loopy.diagnostic import UnableToDetermineAccessRangeError + from loopy.kernel.tools import get_insn_access_map + from loopy.symbolic import isl_set_from_expr + + inner_inames_pred = kernel.insn_inames(insn_pred) - ( + frozenset([candidate_pred]) | outer_inames + ) + + inner_inames_succ = kernel.insn_inames(insn_succ) - ( + frozenset([candidate_succ]) | outer_inames + ) + + try: + amap_pred = get_insn_access_map(kernel, insn_pred, var, inner_inames_pred) + amap_succ = get_insn_access_map(kernel, insn_succ, var, inner_inames_succ) + except UnableToDetermineAccessRangeError: + # either predecessors or successors has a non-affine access i.e. + # fallback to the safer option => infusible + return True + + # since both ranges denote the same variable they must be subscripted with + # the same number of indices. + assert amap_pred.dim(isl.dim_type.out) == amap_succ.dim(isl.dim_type.out) + + ndim = amap_pred.dim(isl.dim_type.out) + + # {{{ set the out dim names as `amap_a_dim0`, `amap_a_dim1`, ... 
+ + for idim in range(ndim): + amap_pred = amap_pred.set_dim_name( + isl.dim_type.out, idim, f"_lpy_amap_a_dim{idim}" + ) + amap_succ = amap_succ.set_dim_name( + isl.dim_type.out, idim, f"_lpy_amap_b_dim{idim}" + ) + + # }}} + + # {{{ amap_pred -> set_pred, amap_succ -> set_succ + + amap_pred = amap_pred.move_dims( + isl.dim_type.in_, + amap_pred.dim(isl.dim_type.in_), + isl.dim_type.out, + 0, + amap_pred.dim(isl.dim_type.out), + ) + + amap_succ = amap_succ.move_dims( + isl.dim_type.in_, + amap_succ.dim(isl.dim_type.in_), + isl.dim_type.out, + 0, + amap_succ.dim(isl.dim_type.out), + ) + + set_pred, set_succ = amap_pred.domain(), amap_succ.domain() + set_pred, set_succ = isl.align_two(set_pred, set_succ) + + # }}} + + # {{{ build the bset, both accesses access the same element + + accesses_same_index_set = isl.BasicSet.universe(set_pred.space) + for idim in range(ndim): + cnstrnt = isl.Constraint.eq_from_names( + set_pred.space, + {f"_lpy_amap_a_dim{idim}": 1, f"_lpy_amap_b_dim{idim}": -1}, + ) + accesses_same_index_set = accesses_same_index_set.add_constraint(cnstrnt) + + # }}} + + candidates_not_equal = isl_set_from_expr( + set_pred.space, + prim.Comparison( + prim.Variable(candidate_pred), ">", prim.Variable(candidate_succ) + ), + ) + return not ( + set_pred & set_succ & accesses_same_index_set & candidates_not_equal + ).is_empty() + + +def _preprocess_deps( + kernel: LoopKernel, + deps: FrozenSet[str], + candidates: FrozenSet[str], + outer_inames: FrozenSet[str], +) -> FrozenSet[str]: + all_deps = set() + + for dep in deps: + if kernel.id_to_insn[dep].within_inames == outer_inames: + all_deps.add(dep) + elif kernel.id_to_insn[dep].within_inames & candidates: + all_deps.add(dep) + else: + all_deps |= reduce( + frozenset.intersection, + ( + kernel.iname_to_insns()[iname] + for iname in kernel.id_to_insn[dep].within_inames + ), + frozenset(kernel.id_to_insn), + ) + + return frozenset(all_deps) + + +def _build_ldg( + kernel: LoopKernel, candidates: FrozenSet[str], 
outer_inames: FrozenSet[str] +): + """ + Returns an instance of :class:`LoopDependenceGraph` needed while fusing + *candidates*. Invoked as a helper function in + :func:`get_kennedy_unweighted_fusion_candidates`. + """ + + from pytools.graph import compute_topological_order + + insns = reduce( + frozenset.intersection, + (frozenset(kernel.iname_to_insns()[iname]) for iname in outer_inames), + frozenset(kernel.id_to_insn), + ) + predecessors = { + insn: ( + _preprocess_deps( + kernel, + kernel.id_to_insn[insn].depends_on, + candidates=candidates, + outer_inames=outer_inames, + ) + & insns + ) + for insn in insns + } + successors = {insn: frozenset() for insn in insns} + + for insn, preds in predecessors.items(): + for pred in preds: + successors[pred] |= frozenset([insn]) + + predecessors, successors, infusible_edges = ( + _remove_irrelevant_insns_from_statement_dag( + kernel, predecessors, successors, candidates + ) + ) + + builder = LoopDependenceGraphBuilder.new(candidates) + + # Interpret the statement DAG as LDG + for pred, succs in successors.items(): + for succ in succs: + (succ_candidate,) = kernel.id_to_insn[succ].within_inames & candidates + (pred_candidate,) = kernel.id_to_insn[pred].within_inames & candidates + builder.add_edge( + pred_candidate, succ_candidate, (pred, succ) in infusible_edges + ) + + # {{{ add infusible edges to the LDG depending on memory deps. 
+ + all_candidate_insns = reduce( + frozenset.union, + (kernel.iname_to_insns()[iname] for iname in candidates), + frozenset(), + ) + + dep_inducing_vars = reduce( + frozenset.union, + ( + frozenset(kernel.id_to_insn[insn].assignee_var_names()) + for insn in all_candidate_insns + ), + frozenset(), + ) + wmap = kernel.writer_map() + rmap = kernel.reader_map() + + topo_order = { + el: i for i, el in enumerate(compute_topological_order(successors)) + } + + for var in dep_inducing_vars: + for writer_id in wmap.get(var, frozenset()) & all_candidate_insns: + for access_id in ( + rmap.get(var, frozenset()) | wmap.get(var, frozenset()) + ) & all_candidate_insns: + if writer_id == access_id: + # no need to add self dependence + continue + + pred, succ = sorted([writer_id, access_id], key=topo_order.get) + (succ_candidate,) = ( + kernel.id_to_insn[succ].within_inames & candidates + ) + (pred_candidate,) = ( + kernel.id_to_insn[pred].within_inames & candidates + ) + + is_infusible = _compute_isinfusible_via_access_map( + kernel, + pred, + pred_candidate, + succ, + succ_candidate, + outer_inames, + var, + ) + + builder.add_edge(pred_candidate, succ_candidate, is_infusible) + + # }}} + + return builder.done() + + +def _fuse_sequential_loops_with_outer_loops( + kernel: LoopKernel, + candidates: FrozenSet[str], + outer_inames: FrozenSet[str], + name_gen: Callable[[str], str], + prefix: str, +): + ldg = _build_ldg(kernel, candidates, outer_inames) + + fused_chunks = {} + + while not ldg.is_empty(): + + # sorting to have a deterministic order. + queue = sorted(ldg.get_loops_with_no_predecessors()) + loops_to_be_fused = set() + non_fusible_loops = set() + while queue: + next_loop_in_queue = queue[0] + queue = queue[1:] + if not (ldg.predecessors[next_loop_in_queue] <= loops_to_be_fused): + # this loop still needs some other loops to be scheduled + # before we can reach this. + # Bye bye 'next_loop_in_queue' :'( , see you when all your + # predecessors have been scheduled. 
+ continue + + if next_loop_in_queue in non_fusible_loops: + # had an non-fusible edge with an already schedule loop. + # Sorry 'next_loop_in_queue', until next time :'(. + continue + + loops_to_be_fused.add(next_loop_in_queue) + + for succ in ldg.successors[next_loop_in_queue]: + if ldg.is_infusible.get((next_loop_in_queue, succ), False): + non_fusible_loops.add(succ) + else: + queue.append(succ) + + ldg = ldg.remove_nodes(loops_to_be_fused) + fused_chunks[name_gen(prefix)] = loops_to_be_fused + + assert reduce(frozenset.union, fused_chunks.values(), frozenset()) == candidates + assert sum(len(val) for val in fused_chunks.values()) == len(candidates) + + return fused_chunks + + +def get_kennedy_unweighted_fusion_candidates( + kernel: LoopKernel, candidates: FrozenSet[str], prefix: str = "ifused" +) -> Mapping[str, FrozenSet[str]]: + """ + Returns the fusion candidates mapping that could be fed to + :func:`rename_inames_in_batch` similar to Ken Kennedy's Unweighted + Loop-Fusion Algorithm. + + .. attribute:: prefix + + Prefix for the fused inames. + """ + from loopy.kernel.data import ConcurrentTag + from loopy.schedule.tools import ( + _get_iname_to_tree_node_id_from_partial_loop_nest_tree, + get_partial_loop_nest_tree, + ) + + assert isinstance(kernel, LoopKernel) + assert isinstance(candidates, frozenset) + + vng = kernel.get_var_name_generator() + fused_chunks = {} + + # {{{ handle concurrent inames + + # filter out concurrent loops. + all_concurrent_tags = reduce( + frozenset.union, + (kernel.inames[iname].tags_of_type(ConcurrentTag) for iname in candidates), + frozenset(), + ) + + concurrent_tag_to_inames = {tag: set() for tag in all_concurrent_tags} + + for iname in candidates: + if kernel.inames[iname].tags_of_type(ConcurrentTag): + # since ConcurrentTag is a UniqueTag there must be exactly one of + # it. 
(tag,) = kernel.inames[iname].tags_of_type(ConcurrentTag)
+            concurrent_tag_to_inames[tag].add(iname)
+
+    for inames in concurrent_tag_to_inames.values():
+        fused_chunks[vng(prefix)] = inames
+        candidates = candidates - inames
+
+    # }}}
+
+    tree = get_partial_loop_nest_tree(kernel)
+    iname_to_tree_node_id = _get_iname_to_tree_node_id_from_partial_loop_nest_tree(
+        tree
+    )
+
+    # {{{ sanitary checks
+
+    _nest_tree_id_to_candidate = {}
+
+    for iname in candidates:
+        loop_nest_tree_node_id = iname_to_tree_node_id[iname]
+        if loop_nest_tree_node_id not in _nest_tree_id_to_candidate:
+            _nest_tree_id_to_candidate[loop_nest_tree_node_id] = iname
+        else:
+            conflict_iname = _nest_tree_id_to_candidate[loop_nest_tree_node_id]
+            raise LoopyError(
+                f"'{iname}' and '{conflict_iname}' "
+                "cannot be fused as they can be nested "
+                "within one another."
+            )
+
+    for iname in candidates:
+        outer_loops = reduce(
+            frozenset.union,
+            tree.ancestors(iname_to_tree_node_id[iname]),
+            frozenset(),
+        )
+        if outer_loops & candidates:
+            raise LoopyError(
+                f"Cannot fuse '{iname}' with"
+                f" '{outer_loops & candidates}' as they"
+                " maybe nesting within one another."
+            )
+
+    del _nest_tree_id_to_candidate
+
+    # }}}
+
+    # just_outer_loop_nest: mapping from loop nest to the candidates they
+    # contain
+    just_outer_loop_nest = {
+        tree.parent(iname_to_tree_node_id[iname]): set() for iname in candidates
+    }
+
+    for iname in candidates:
+        just_outer_loop_nest[tree.parent(iname_to_tree_node_id[iname])].add(iname)
+
+    for outer_inames, inames in just_outer_loop_nest.items():
+        fused_chunks.update(
+            _fuse_sequential_loops_with_outer_loops(
+                kernel, frozenset(inames), outer_inames, vng, prefix
+            )
+        )
+
+    return fused_chunks
+
+
+def rename_inames_in_batch(
+    kernel: LoopKernel, batches: Mapping[str, FrozenSet[str]]
+) -> LoopKernel:
+    """
+    Returns a copy of *kernel* with inames renamed according to *batches*.
+
+    :arg kernel: An instance of :class:`loopy.LoopKernel`.
+ :arg batches: A mapping from ``new_iname`` to a :class:`frozenset` of + inames that are to be renamed to ``new_iname``. + """ + from loopy.transform.iname import rename_iname + + for new_iname, candidates in batches.items(): + for iname in candidates: + kernel = rename_iname(kernel, iname, new_iname, existing_ok=True) + + return kernel + + +# vim: foldmethod=marker From a05f94df318bf5608b4519dd359fabbdfc03d73a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 31 Oct 2021 16:56:41 -0500 Subject: [PATCH 3/3] Test loop fusion implementation --- test/test_loop_fusion.py | 276 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 test/test_loop_fusion.py diff --git a/test/test_loop_fusion.py b/test/test_loop_fusion.py new file mode 100644 index 000000000..a155f2267 --- /dev/null +++ b/test/test_loop_fusion.py @@ -0,0 +1,276 @@ +__copyright__ = "Copyright (C) 2021 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import logging +import sys + +import numpy as np + +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa + +import loopy as lp + + +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl as pytest_generate_tests + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa + + +__all__ = [ + "cl", # "cl.create_some_context" + "pytest_generate_tests" +] + + +def test_loop_fusion_vanilla(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i0, i1, j0, j1]: 0 <= i0, i1, j0, j1 < 10}", + """ + a[i0] = 1 + b[i1, j0] = 2 {id=write_b} + c[j1] = 3 {id=write_c} + """) + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + assert len(ref_knl["loopy_kernel"].all_inames()) == 4 + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_outer_iname_preventing_fusion(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i0, j0, j1]: 0 <= i0, j0, j1 < 10}", + """ + a[i0] = 1 + b[i0, j0] = 2 {id=write_b} + c[j1] = 3 {id=write_c} + """) + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, 
ctx, knl) + + +def test_loop_fusion_with_loop_independent_deps(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[j0, j1]: 0 <= j0, j1 < 10}", + """ + a[j0] = 1 + b[j1] = 2 * a[j1] + """, seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_constrained_by_outer_loop_deps(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[j0, j1]: 0 <= j0, j1 < 10}", + """ + a[j0] = 1 {id=write_a} + b = 2 {id=write_b} + c[j1] = 2 * a[j1] {id=write_c} + """, seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["write_a"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_carried_deps1(ctx_factory): + + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i0, i1]: 1<=i0, i1<10}", + """ + x[i0] = i0 {id=first_write} + x[i1-1] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 1 + assert 
len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_carried_deps2(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i0, i1]: 1<=i0, i1<10}", + """ + x[i0-1] = i0 {id=first_write} + x[i1] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_indirection(ctx_factory): + ctx = ctx_factory() + map_ = np.random.permutation(10) + cq = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i0, i1]: 0<=i0, i1<10}", + """ + x[i0] = i0 {id=first_write} + x[map[i1]] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 0 + + _, (out1,) = ref_knl(cq, map=map_) + _, (out2,) = knl(cq, map=map_) + np.testing.assert_allclose(out1, out2) + + +def test_loop_fusion_with_induced_dependencies_from_sibling_nests(ctx_factory): + ctx = ctx_factory() + t_unit = lp.make_kernel( + "{[i0, j, i1, i2]: 0<=i0, 
j, i1, i2<10}", + """ + <> tmp0[i0] = i0 + <> tmp1[j] = tmp0[j] + <> tmp2[j] = j + out1[i1] = tmp2[i1] + out2[i2] = 2 * tmp1[i2] + """) + ref_t_unit = t_unit + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, frozenset(["i0", "i1"]))) + t_unit = t_unit.with_kernel(knl) + + # 'i1', 'i2' should not be fused. If fused that would lead to an + # unshcedulable kernel. Making sure that the kernel 'runs' suffices that + # the transformation was successful. + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker