From 71f67383b3d74b62da99ef0f737853ddb44f0fd7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 31 Oct 2021 16:54:49 -0500 Subject: [PATCH 1/3] Add a utility to get the instruction access map --- loopy/kernel/tools.py | 63 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 9a14aedd5..b3f39b4d8 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -2151,4 +2151,67 @@ def get_hw_axis_base_for_codegen(kernel: LoopKernel, iname: str) -> isl.Aff: constants_only=False) return lower_bound + +# {{{ get access map from an instruction + +class _IndexCollector(CombineMapper): + def __init__(self, var): + self.var = var + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_subscript(self, expr): + if expr.aggregate.name == self.var: + return (super().map_subscript(expr) | frozenset([expr.index_tuple])) + else: + return super().map_subscript(expr) + + def map_algebraic_leaf(self, expr): + return frozenset() + + map_constant = map_algebraic_leaf + + +def _project_out_inames_from_maps(amaps, inames_to_project_out): + new_amaps = [] + for amap in amaps: + for iname in inames_to_project_out: + dt, pos = amap.get_var_dict()[iname] + amap = amap.project_out(dt, pos, 1) + + new_amaps.append(amap) + + return new_amaps + + +def _union_amaps(amaps): + import islpy as isl + return reduce(isl.Map.union, amaps[1:], amaps[0]) + + +def get_insn_access_map(kernel, insn_id, var, inner_inames): + from loopy.match import Id + from loopy.symbolic import get_access_map + from loopy.transform.subst import expand_subst + + insn = kernel.id_to_insn[insn_id] + + kernel = expand_subst(kernel, within=Id(insn_id)) + indices = list(_IndexCollector(var)((insn.expression, + insn.assignees, + list(insn.predicates)))) + + amaps = _project_out_inames_from_maps( + [get_access_map(kernel.get_inames_domain(insn.within_inames), + idx, 
kernel.assumptions) + + for idx in indices], + inner_inames) + + return _union_amaps(amaps) + +# }}} + # vim: foldmethod=marker From def6bb13b1d72022d1d955d69b73aec85c299c04 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 31 Oct 2021 16:55:30 -0500 Subject: [PATCH 2/3] Implement loop fusion transformation --- doc/ref_transform.rst | 6 + loopy/__init__.py | 6 + loopy/transform/loop_fusion.py | 643 +++++++++++++++++++++++++++++++++ 3 files changed, 655 insertions(+) create mode 100644 loopy/transform/loop_fusion.py diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 9ef012d66..3c209db9e 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -143,4 +143,10 @@ TODO: Matching instruction tags .. automodule:: loopy.match + +Fusing Loops +------------ + +.. automodule:: loopy.transform.loop_fusion + .. vim: tw=75:spell diff --git a/loopy/__init__.py b/loopy/__init__.py index 07f06a021..8eca3e6da 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -180,6 +180,10 @@ simplify_indices, tag_instructions, ) +from loopy.transform.loop_fusion import ( + get_kennedy_unweighted_fusion_candidates, + rename_inames_in_batch, +) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call from loopy.transform.padding import ( add_padding, @@ -325,6 +329,7 @@ "get_dot_dependency_graph", "get_global_barrier_order", "get_iname_duplication_options", + "get_kennedy_unweighted_fusion_candidates", "get_mem_access_map", "get_one_linearized_kernel", "get_one_scheduled_kernel", @@ -371,6 +376,7 @@ "rename_callable", "rename_iname", "rename_inames", + "rename_inames_in_batch", "replace_instruction_ids", "save_and_reload_temporaries", "set_argument_order", diff --git a/loopy/transform/loop_fusion.py b/loopy/transform/loop_fusion.py new file mode 100644 index 000000000..a9252cfaa --- /dev/null +++ b/loopy/transform/loop_fusion.py @@ -0,0 +1,643 @@ +__copyright__ = """ +Copyright (C) 2021-24 Kaushik Kulkarni +""" + +__license__ = """ 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from dataclasses import dataclass +from functools import reduce +from typing import Callable, Dict, FrozenSet, Mapping, Set, Tuple + +from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel + + +__doc__ = """ +.. autofunction:: rename_inames_in_batch +.. autofunction:: get_kennedy_unweighted_fusion_candidates +""" + + +# {{{ Loop Dependence graph class + builder + + +@dataclass(frozen=True, eq=True) +class LoopDependenceGraph: + """ + .. attribute:: successors + + A mapping from iname (``i``) to the collection of inames that can be + scheduled only after the loop corresponding to ``i`` has been exited. + + .. attribute:: predecessors + + A mapping from iname (``i``) to the collection of inames that must have + been exited before entering ``i``. + + .. attribute:: is_infusible + + A mapping from the edges in the loop dependence graph to their + fusibility criterion. 
An edge in this mapping is represented by a pair + of inames``(iname_i, iname_j)`` such that the edge ``iname_i -> + iname_j`` is present in the graph. + + .. note:: + + Both :attr:`successors` and :attr:`predecessors` are maintained to + reduce the complexity of graph primitive operations (like remove node, + add edge, etc.). + """ + + successors: Mapping[str, FrozenSet[str]] + predecessors: Mapping[str, FrozenSet[str]] + is_infusible: Mapping[Tuple[str, str], bool] + + @classmethod + def new(cls, successors, is_infusible): + predecessors = {node: set() for node in successors} + for node, succs in successors.items(): + for succ in succs: + predecessors[succ].add(node) + + predecessors = { + node: frozenset(preds) for node, preds in predecessors.items() + } + successors = {node: frozenset(succs) for node, succs in successors.items()} + + return LoopDependenceGraph(successors, predecessors, is_infusible) + + def is_empty(self): + """ + Returns *True* only if the loop dependence graph contains no nodes. + """ + return len(self.successors) == 0 + + def get_loops_with_no_predecessors(self): + return { + loop for loop, preds in self.predecessors.items() if len(preds) == 0 + } + + def remove_nodes(self, nodes_to_remove): + """ + Returns a copy of *self* after removing *nodes_to_remove* in the graph. + This routine adds necessary edges after removing *nodes_to_remove* to + conserve the scheduling constraints present in the graph. + """ + # {{{ Step 1. Remove the nodes + + new_successors = { + node: succs + for node, succs in self.successors.items() + if node not in nodes_to_remove + } + new_predecessors = { + node: preds + for node, preds in self.predecessors.items() + if node not in nodes_to_remove + } + + new_is_infusible = { + (from_, to): v + for (from_, to), v in self.is_infusible.items() + if (from_ not in nodes_to_remove and to not in nodes_to_remove) + } + + # }}} + + # {{{ Step 2. 
Propagate dependencies + + # For every Node 'R' to be removed and every pair (S, P) such that + # 1. there exists an edge 'P' -> 'R' in the original graph, and, + # 2. there exits an edge 'R' -> 'S' in the original graph. + # add the edge 'P' -> 'S' in the new graph. + + for node_to_remove in nodes_to_remove: + for succ in self.successors[node_to_remove] - nodes_to_remove: + new_predecessors[succ] = new_predecessors[succ] - frozenset( + [node_to_remove] + ) + + for pred in self.predecessors[node_to_remove] - nodes_to_remove: + new_successors[pred] = new_successors[pred] - frozenset( + [node_to_remove] + ) + + # }}} + + return LoopDependenceGraph( + new_successors, new_predecessors, new_is_infusible + ) + + +@dataclass +class LoopDependenceGraphBuilder: + _dag: Dict[str, Set[str]] + _is_infusible: Mapping[Tuple[str, str], bool] + + @classmethod + def new(cls, candidates): + return LoopDependenceGraphBuilder( + {iname: set() for iname in candidates}, {} + ) + + def add_edge(self, from_: str, to: str, is_infusible: bool): + self._dag[from_].add(to) + self._is_infusible[(from_, to)] = is_infusible or self._is_infusible.get( + (from_, to), False + ) + + def done(self): + """ + Returns the built :class:`LoopDependenceGraph`. + """ + return LoopDependenceGraph.new(self._dag, self._is_infusible) + + +# }}} + + +def _remove_irrelevant_insns_from_statement_dag( + kernel: LoopKernel, + insn_to_predecessors: Mapping[str, FrozenSet[str]], + insn_to_successors: Mapping[str, FrozenSet[str]], + candidates: FrozenSet[str], +) -> Tuple[ + Mapping[str, FrozenSet[str]], + Mapping[str, FrozenSet[str]], + FrozenSet[Tuple[str, str]], +]: + """ + Removes instructions from the statement DAG represented by + *insn_to_predecessors*, *insn_to_successors* that are not nested in + *candidates*. + + Returns a new statement DAG ``new_predecessors, new_successors`` , where + edges are added between the remaining nodes of the statement DAG to + preserve the dependencies in the original DAG. 
+ """ + # {{{ input validation + + assert set(insn_to_predecessors) == set(insn_to_successors) + assert all(isinstance(val, frozenset) for val in insn_to_predecessors.values()) + assert all(isinstance(val, frozenset) for val in insn_to_successors.values()) + + # }}} + + insns_to_remove = { + insn + for insn in insn_to_successors + if len(kernel.id_to_insn[insn].within_inames & candidates) == 0 + } + + new_predecessors = insn_to_predecessors.copy() + new_successors = insn_to_successors.copy() + infusible_edges_in_statement_dag = set() + + for insn_to_remove in insns_to_remove: + for pred in new_predecessors[insn_to_remove]: + new_successors[pred] = ( + new_successors[pred] - frozenset([insn_to_remove]) + ) | new_successors[insn_to_remove] + + for succ in new_successors[insn_to_remove]: + new_predecessors[succ] = ( + new_predecessors[succ] - frozenset([insn_to_remove]) + ) | new_predecessors[insn_to_remove] + + for pred in new_predecessors[insn_to_remove]: + for succ in new_successors[insn_to_remove]: + # now mark the edge from pred -> succ infusible iff both 'pred' and + # 'succ' are *not* in insns_to_remove + if (pred not in insns_to_remove) and (succ not in insns_to_remove): + infusible_edges_in_statement_dag.add((pred, succ)) + + del new_predecessors[insn_to_remove] + del new_successors[insn_to_remove] + + return ( + new_predecessors, + new_successors, + frozenset(infusible_edges_in_statement_dag), + ) + + +def _compute_isinfusible_via_access_map( + kernel, insn_pred, candidate_pred, insn_succ, candidate_succ, outer_inames, var +): + """ + Returns *True* if the inames *candidate_pred* and *candidate_succ* are fused then + that might lead to a loop carried dependency for *var*. 
+ """ + import islpy as isl + import pymbolic.primitives as prim + + from loopy.diagnostic import UnableToDetermineAccessRangeError + from loopy.kernel.tools import get_insn_access_map + from loopy.symbolic import isl_set_from_expr + + inner_inames_pred = kernel.insn_inames(insn_pred) - ( + frozenset([candidate_pred]) | outer_inames + ) + + inner_inames_succ = kernel.insn_inames(insn_succ) - ( + frozenset([candidate_succ]) | outer_inames + ) + + try: + amap_pred = get_insn_access_map(kernel, insn_pred, var, inner_inames_pred) + amap_succ = get_insn_access_map(kernel, insn_succ, var, inner_inames_succ) + except UnableToDetermineAccessRangeError: + # either predecessors or successors has a non-affine access i.e. + # fallback to the safer option => infusible + return True + + # since both ranges denote the same variable they must be subscripted with + # the same number of indices. + assert amap_pred.dim(isl.dim_type.out) == amap_succ.dim(isl.dim_type.out) + + ndim = amap_pred.dim(isl.dim_type.out) + + # {{{ set the out dim names as `amap_a_dim0`, `amap_a_dim1`, ... 
+ + for idim in range(ndim): + amap_pred = amap_pred.set_dim_name( + isl.dim_type.out, idim, f"_lpy_amap_a_dim{idim}" + ) + amap_succ = amap_succ.set_dim_name( + isl.dim_type.out, idim, f"_lpy_amap_b_dim{idim}" + ) + + # }}} + + # {{{ amap_pred -> set_pred, amap_succ -> set_succ + + amap_pred = amap_pred.move_dims( + isl.dim_type.in_, + amap_pred.dim(isl.dim_type.in_), + isl.dim_type.out, + 0, + amap_pred.dim(isl.dim_type.out), + ) + + amap_succ = amap_succ.move_dims( + isl.dim_type.in_, + amap_succ.dim(isl.dim_type.in_), + isl.dim_type.out, + 0, + amap_succ.dim(isl.dim_type.out), + ) + + set_pred, set_succ = amap_pred.domain(), amap_succ.domain() + set_pred, set_succ = isl.align_two(set_pred, set_succ) + + # }}} + + # {{{ build the bset, both accesses access the same element + + accesses_same_index_set = isl.BasicSet.universe(set_pred.space) + for idim in range(ndim): + cnstrnt = isl.Constraint.eq_from_names( + set_pred.space, + {f"_lpy_amap_a_dim{idim}": 1, f"_lpy_amap_b_dim{idim}": -1}, + ) + accesses_same_index_set = accesses_same_index_set.add_constraint(cnstrnt) + + # }}} + + candidates_not_equal = isl_set_from_expr( + set_pred.space, + prim.Comparison( + prim.Variable(candidate_pred), ">", prim.Variable(candidate_succ) + ), + ) + return not ( + set_pred & set_succ & accesses_same_index_set & candidates_not_equal + ).is_empty() + + +def _preprocess_deps( + kernel: LoopKernel, + deps: FrozenSet[str], + candidates: FrozenSet[str], + outer_inames: FrozenSet[str], +) -> FrozenSet[str]: + all_deps = set() + + for dep in deps: + if kernel.id_to_insn[dep].within_inames == outer_inames: + all_deps.add(dep) + elif kernel.id_to_insn[dep].within_inames & candidates: + all_deps.add(dep) + else: + all_deps |= reduce( + frozenset.intersection, + ( + kernel.iname_to_insns()[iname] + for iname in kernel.id_to_insn[dep].within_inames + ), + frozenset(kernel.id_to_insn), + ) + + return frozenset(all_deps) + + +def _build_ldg( + kernel: LoopKernel, candidates: FrozenSet[str], 
outer_inames: FrozenSet[str] +): + """ + Returns an instance of :class:`LoopDependenceGraph` needed while fusing + *candidates*. Invoked as a helper function in + :func:`get_kennedy_unweighted_fusion_candidates`. + """ + + from pytools.graph import compute_topological_order + + insns = reduce( + frozenset.intersection, + (frozenset(kernel.iname_to_insns()[iname]) for iname in outer_inames), + frozenset(kernel.id_to_insn), + ) + predecessors = { + insn: ( + _preprocess_deps( + kernel, + kernel.id_to_insn[insn].depends_on, + candidates=candidates, + outer_inames=outer_inames, + ) + & insns + ) + for insn in insns + } + successors = {insn: frozenset() for insn in insns} + + for insn, preds in predecessors.items(): + for pred in preds: + successors[pred] |= frozenset([insn]) + + predecessors, successors, infusible_edges = ( + _remove_irrelevant_insns_from_statement_dag( + kernel, predecessors, successors, candidates + ) + ) + + builder = LoopDependenceGraphBuilder.new(candidates) + + # Interpret the statement DAG as LDG + for pred, succs in successors.items(): + for succ in succs: + (succ_candidate,) = kernel.id_to_insn[succ].within_inames & candidates + (pred_candidate,) = kernel.id_to_insn[pred].within_inames & candidates + builder.add_edge( + pred_candidate, succ_candidate, (pred, succ) in infusible_edges + ) + + # {{{ add infusible edges to the LDG depending on memory deps. 
+ + all_candidate_insns = reduce( + frozenset.union, + (kernel.iname_to_insns()[iname] for iname in candidates), + frozenset(), + ) + + dep_inducing_vars = reduce( + frozenset.union, + ( + frozenset(kernel.id_to_insn[insn].assignee_var_names()) + for insn in all_candidate_insns + ), + frozenset(), + ) + wmap = kernel.writer_map() + rmap = kernel.reader_map() + + topo_order = { + el: i for i, el in enumerate(compute_topological_order(successors)) + } + + for var in dep_inducing_vars: + for writer_id in wmap.get(var, frozenset()) & all_candidate_insns: + for access_id in ( + rmap.get(var, frozenset()) | wmap.get(var, frozenset()) + ) & all_candidate_insns: + if writer_id == access_id: + # no need to add self dependence + continue + + pred, succ = sorted([writer_id, access_id], key=topo_order.get) + (succ_candidate,) = ( + kernel.id_to_insn[succ].within_inames & candidates + ) + (pred_candidate,) = ( + kernel.id_to_insn[pred].within_inames & candidates + ) + + is_infusible = _compute_isinfusible_via_access_map( + kernel, + pred, + pred_candidate, + succ, + succ_candidate, + outer_inames, + var, + ) + + builder.add_edge(pred_candidate, succ_candidate, is_infusible) + + # }}} + + return builder.done() + + +def _fuse_sequential_loops_with_outer_loops( + kernel: LoopKernel, + candidates: FrozenSet[str], + outer_inames: FrozenSet[str], + name_gen: Callable[[str], str], + prefix: str, +): + ldg = _build_ldg(kernel, candidates, outer_inames) + + fused_chunks = {} + + while not ldg.is_empty(): + + # sorting to have a deterministic order. + queue = sorted(ldg.get_loops_with_no_predecessors()) + loops_to_be_fused = set() + non_fusible_loops = set() + while queue: + next_loop_in_queue = queue[0] + queue = queue[1:] + if not (ldg.predecessors[next_loop_in_queue] <= loops_to_be_fused): + # this loop still needs some other loops to be scheduled + # before we can reach this. + # Bye bye 'next_loop_in_queue' :'( , see you when all your + # predecessors have been scheduled. 
+ continue + + if next_loop_in_queue in non_fusible_loops: + # had an non-fusible edge with an already schedule loop. + # Sorry 'next_loop_in_queue', until next time :'(. + continue + + loops_to_be_fused.add(next_loop_in_queue) + + for succ in ldg.successors[next_loop_in_queue]: + if ldg.is_infusible.get((next_loop_in_queue, succ), False): + non_fusible_loops.add(succ) + else: + queue.append(succ) + + ldg = ldg.remove_nodes(loops_to_be_fused) + fused_chunks[name_gen(prefix)] = loops_to_be_fused + + assert reduce(frozenset.union, fused_chunks.values(), frozenset()) == candidates + assert sum(len(val) for val in fused_chunks.values()) == len(candidates) + + return fused_chunks + + +def get_kennedy_unweighted_fusion_candidates( + kernel: LoopKernel, candidates: FrozenSet[str], prefix: str = "ifused" +) -> Mapping[str, FrozenSet[str]]: + """ + Returns the fusion candidates mapping that could be fed to + :func:`rename_inames_in_batch` similar to Ken Kennedy's Unweighted + Loop-Fusion Algorithm. + + .. attribute:: prefix + + Prefix for the fused inames. + """ + from loopy.kernel.data import ConcurrentTag + from loopy.schedule.tools import ( + _get_iname_to_tree_node_id_from_partial_loop_nest_tree, + get_partial_loop_nest_tree, + ) + + assert isinstance(kernel, LoopKernel) + assert isinstance(candidates, frozenset) + + vng = kernel.get_var_name_generator() + fused_chunks = {} + + # {{{ handle concurrent inames + + # filter out concurrent loops. + all_concurrent_tags = reduce( + frozenset.union, + (kernel.inames[iname].tags_of_type(ConcurrentTag) for iname in candidates), + frozenset(), + ) + + concurrent_tag_to_inames = {tag: set() for tag in all_concurrent_tags} + + for iname in candidates: + if kernel.inames[iname].tags_of_type(ConcurrentTag): + # since ConcurrentTag is a UniqueTag there must be exactly one of + # it. 
(tag,) = kernel.inames[iname].tags_of_type(ConcurrentTag)
+            concurrent_tag_to_inames[tag].add(iname)
+
+    for inames in concurrent_tag_to_inames.values():
+        fused_chunks[vng(prefix)] = inames
+        candidates = candidates - inames
+
+    # }}}
+
+    tree = get_partial_loop_nest_tree(kernel)
+    iname_to_tree_node_id = _get_iname_to_tree_node_id_from_partial_loop_nest_tree(
+        tree
+    )
+
+    # {{{ sanitary checks
+
+    _nest_tree_id_to_candidate = {}
+
+    for iname in candidates:
+        loop_nest_tree_node_id = iname_to_tree_node_id[iname]
+        if loop_nest_tree_node_id not in _nest_tree_id_to_candidate:
+            _nest_tree_id_to_candidate[loop_nest_tree_node_id] = iname
+        else:
+            conflict_iname = _nest_tree_id_to_candidate[loop_nest_tree_node_id]
+            raise LoopyError(
+                f"'{iname}' and '{conflict_iname}' "
+                "cannot be fused as they can be nested "
+                "within one another."
+            )
+
+    for iname in candidates:
+        outer_loops = reduce(
+            frozenset.union,
+            tree.ancestors(iname_to_tree_node_id[iname]),
+            frozenset(),
+        )
+        if outer_loops & candidates:
+            raise LoopyError(
+                f"Cannot fuse '{iname}' with"
+                f" '{outer_loops & candidates}' as they"
+                " maybe nesting within one another."
+            )
+
+    del _nest_tree_id_to_candidate
+
+    # }}}
+
+    # just_outer_loop_nest: mapping from loop nest to the candidates they
+    # contain
+    just_outer_loop_nest = {
+        tree.parent(iname_to_tree_node_id[iname]): set() for iname in candidates
+    }
+
+    for iname in candidates:
+        just_outer_loop_nest[tree.parent(iname_to_tree_node_id[iname])].add(iname)
+
+    for outer_inames, inames in just_outer_loop_nest.items():
+        fused_chunks.update(
+            _fuse_sequential_loops_with_outer_loops(
+                kernel, frozenset(inames), outer_inames, vng, prefix
+            )
+        )
+
+    return fused_chunks
+
+
+def rename_inames_in_batch(
+    kernel: LoopKernel, batches: Mapping[str, FrozenSet[str]]
+) -> LoopKernel:
+    """
+    Returns a copy of *kernel* with inames renamed according to *batches*.
+
+    :arg kernel: An instance of :class:`loopy.LoopKernel`.
+ :arg batches: A mapping from ``new_iname`` to a :class:`frozenset` of + inames that are to be renamed to ``new_iname``. + """ + from loopy.transform.iname import rename_iname + + for new_iname, candidates in batches.items(): + for iname in candidates: + kernel = rename_iname(kernel, iname, new_iname, existing_ok=True) + + return kernel + + +# vim: foldmethod=marker From a05f94df318bf5608b4519dd359fabbdfc03d73a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 31 Oct 2021 16:56:41 -0500 Subject: [PATCH 3/3] Test loop fusion implementation --- test/test_loop_fusion.py | 276 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 test/test_loop_fusion.py diff --git a/test/test_loop_fusion.py b/test/test_loop_fusion.py new file mode 100644 index 000000000..a155f2267 --- /dev/null +++ b/test/test_loop_fusion.py @@ -0,0 +1,276 @@ +__copyright__ = "Copyright (C) 2021 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import logging +import sys + +import numpy as np + +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa + +import loopy as lp + + +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl as pytest_generate_tests + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa + + +__all__ = [ + "cl", # "cl.create_some_context" + "pytest_generate_tests" +] + + +def test_loop_fusion_vanilla(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i0, i1, j0, j1]: 0 <= i0, i1, j0, j1 < 10}", + """ + a[i0] = 1 + b[i1, j0] = 2 {id=write_b} + c[j1] = 3 {id=write_c} + """) + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + assert len(ref_knl["loopy_kernel"].all_inames()) == 4 + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_outer_iname_preventing_fusion(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i0, j0, j1]: 0 <= i0, j0, j1 < 10}", + """ + a[i0] = 1 + b[i0, j0] = 2 {id=write_b} + c[j1] = 3 {id=write_c} + """) + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, 
ctx, knl) + + +def test_loop_fusion_with_loop_independent_deps(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[j0, j1]: 0 <= j0, j1 < 10}", + """ + a[j0] = 1 + b[j1] = 2 * a[j1] + """, seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_constrained_by_outer_loop_deps(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[j0, j1]: 0 <= j0, j1 < 10}", + """ + a[j0] = 1 {id=write_a} + b = 2 {id=write_b} + c[j1] = 2 * a[j1] {id=write_c} + """, seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["write_a"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_carried_deps1(ctx_factory): + + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i0, i1]: 1<=i0, i1<10}", + """ + x[i0] = i0 {id=first_write} + x[i1-1] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 1 + assert 
len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_carried_deps2(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i0, i1]: 1<=i0, i1<10}", + """ + x[i0-1] = i0 {id=first_write} + x[i1] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_indirection(ctx_factory): + ctx = ctx_factory() + map_ = np.random.permutation(10) + cq = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i0, i1]: 0<=i0, i1<10}", + """ + x[i0] = i0 {id=first_write} + x[map[i1]] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 0 + + _, (out1,) = ref_knl(cq, map=map_) + _, (out2,) = knl(cq, map=map_) + np.testing.assert_allclose(out1, out2) + + +def test_loop_fusion_with_induced_dependencies_from_sibling_nests(ctx_factory): + ctx = ctx_factory() + t_unit = lp.make_kernel( + "{[i0, j, i1, i2]: 0<=i0, 
j, i1, i2<10}", + """ + <> tmp0[i0] = i0 + <> tmp1[j] = tmp0[j] + <> tmp2[j] = j + out1[i1] = tmp2[i1] + out2[i2] = 2 * tmp1[i2] + """) + ref_t_unit = t_unit + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, frozenset(["i0", "i1"]))) + t_unit = t_unit.with_kernel(knl) + + # 'i1', 'i2' should not be fused. If fused that would lead to an + # unshcedulable kernel. Making sure that the kernel 'runs' suffices that + # the transformation was successful. + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker