Commit 2327a56

Qualcomm AI Engine Direct - xr model enablement (mld_f)

Summary:
- add gather op support
- make the cast / slice ops more general

1 parent df75088

File tree

10 files changed: +230 -22 lines changed


backends/qualcomm/_passes/qnn_pass_manager.py

+1
@@ -182,6 +182,7 @@ def transform_for_to_edge_pipeline(
 
     # Before quantizer
     def transform_for_annotation_pipeline(self, graph_module: GraphModule):
+        self.add_pass(RemoveRedundancy(quantization_capture=True))
         self.add_pass(ReduceDynamicRange())
         self.add_pass(RecomposePixelUnshuffle(quantization_capture=True))
         self.add_pass(ReplaceArangeArgs())
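
Note: running RemoveRedundancy ahead of the annotation pipeline (with quantization_capture=True) strips the aten._assert_tensor_metadata nodes that torch.export can leave behind for .to()-style calls, so the quantizer only sees real compute ops. A minimal sketch of the situation being handled (ToyModel is a hypothetical stand-in; whether the assert node appears depends on the PyTorch version):

    import torch

    class ToyModel(torch.nn.Module):  # hypothetical example module
        def forward(self, x):
            return x.to(torch.int32) + 1

    ep = torch.export.export(ToyModel(), (torch.randint(0, 10, (2, 3)),))
    asserts = [
        n for n in ep.graph.nodes
        if n.target == torch.ops.aten._assert_tensor_metadata.default
    ]
    print(f"assert_tensor_metadata nodes in the captured graph: {len(asserts)}")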

backends/qualcomm/_passes/remove_redundancy.py

+15-2
@@ -14,9 +14,9 @@ class RemoveRedundancy(ExportPass):
     Trim certain operators to reduce unnecessary overhead.
     """
 
-    def __init__(self):
+    def __init__(self, quantization_capture=False):
         super(RemoveRedundancy, self).__init__()
-        self.redundant_ops = {
+        self.redundant_ops_general = {
             torch.clone: self._default_condition,
             torch.ops.aten.clone.default: self._default_condition,
             exir_ops.edge.aten.clone.default: self._default_condition,
@@ -27,7 +27,16 @@ def __init__(self):
             exir_ops.edge.dim_order_ops._to_dim_order_copy.default: self._dim_order_op_condition,
             # remove channel_last / contiguous _to_copy if '_skip_dim_order' is set to True
             exir_ops.edge.aten._to_copy.default: self._to_copy_op_condition,
+            torch.ops.aten._assert_tensor_metadata.default: self._default_condition,
         }
+        self.redundant_ops_annotation = {
+            torch.ops.aten._assert_tensor_metadata.default: self._default_condition,
+        }
+        self.redundant_ops = (
+            self.redundant_ops_annotation
+            if quantization_capture
+            else self.redundant_ops_general
+        )
 
     def _dim_order_op_condition(self, node):
         dim_order = node.kwargs.get("dim_order")
@@ -49,6 +58,10 @@ def _remove(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
                 continue
 
             to_be_remove = n
+            # assert_tensor_metadata op has no user
+            if len(n.users.keys()) == 0:
+                n.args = ()
+            # normal case
             for user_n in list(n.users.keys()):
                 user_n.replace_input_with(n, n.args[0])
             graph_module.graph.erase_node(to_be_remove)
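
Note: the zero-user branch above exists because _assert_tensor_metadata is a pure side-effect node: nothing consumes its output, so there is no user to rewire, and the pass clears its args before erase_node runs. A standalone torch.fx sketch of the same trick (graph built by hand for illustration; assumes a PyTorch build that exposes the assert op):

    import operator

    import torch
    import torch.fx

    g = torch.fx.Graph()
    x = g.placeholder("x")
    # a dead-end node with no users, mirroring _assert_tensor_metadata
    side = g.call_function(torch.ops.aten._assert_tensor_metadata.default, (x,))
    out = g.call_function(operator.add, (x, 1))
    g.output(out)

    side.args = ()      # detaches the node from x's user list, as the pass does
    g.erase_node(side)  # succeeds: the node has no users
    g.lint()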

backends/qualcomm/builders/__init__.py

+2
@@ -32,6 +32,7 @@
     op_expand,
     op_full,
     op_full_like,
+    op_gather,
     op_ge,
     op_gelu,
     op_group_norm,
@@ -120,6 +121,7 @@
     op_expand,
     op_full,
     op_full_like,
+    op_gather,
     op_ge,
     op_gelu,
     op_group_norm,
backends/qualcomm/builders/op_gather.py

+101

@@ -0,0 +1,101 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import cast, Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_DATA
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpCast, OpGatherElements, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class Gather(NodeVisitor):
+    target = ["aten.gather.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        input_node = node.args[0]
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        dim = cast(int, node.args[1])
+
+        indices_node = node.args[2]
+        indices_tensor = self.get_tensor(indices_node, node)
+        indices_tensor_wrapper = self.define_tensor(
+            indices_node,
+            node,
+            indices_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        cast_node = self.edge_program.graph.create_node(
+            "call_function",
+            exir_ops.edge.aten._to_copy.default,
+            (indices_node,),
+            {"dtype": torch.int32},
+        )
+        cast_node.meta["val"] = indices_node.meta["val"].to(torch.int32)
+        cast_tensor = self.get_tensor(cast_node, node)
+        cast_tensor_wrapper = self.define_tensor(
+            cast_node,
+            node,
+            cast_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+        # graph is not allowed to be modified in partition stage
+        # erase it here to prevent lowering failure
+        self.edge_program.graph.erase_node(cast_node)
+        cast_op = PyQnnWrapper.PyQnnOpWrapper(
+            f"{node.name}_cast_i64_to_i32", QNN_OP_PACKAGE_NAME_QTI_AISW, OpCast.op_name
+        )
+        cast_op.AddInputTensors([indices_tensor_wrapper])
+        cast_op.AddOutputTensors([cast_tensor_wrapper])
+
+        gather_input_tensors = [input_tensor_wrapper, cast_tensor_wrapper]
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+        gather_output_tensors = [output_tensor_wrapper]
+
+        gather_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpGatherElements.op_name,
+        )
+        gather_op.AddInputTensors(gather_input_tensors)
+        gather_op.AddOutputTensors(gather_output_tensors)
+        gather_op.AddScalarParam(
+            OpGatherElements.param_axis,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            {QCOM_DATA: np.uint32(dim)},
+        )
+
+        return [cast_op, gather_op]
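
Note: aten.gather.default is mapped to QNN GatherElements (per-element gather along one axis) rather than Gather, which selects whole slices like torch.index_select. PyTorch produces int64 index tensors, and the Cast emitted above suggests the backend consumes int32 indices, which is lossless as long as every index fits in 32 bits. A quick eager-mode check of the intended semantics:

    import torch

    x = torch.arange(12.0).reshape(3, 4)
    idx = torch.tensor([[0, 3], [1, 2], [2, 0]])  # int64 by default
    print(torch.gather(x, 1, idx))
    # tensor([[ 0.,  3.],
    #         [ 5.,  6.],
    #         [10.,  8.]])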

backends/qualcomm/builders/op_slice_copy.py

+9-4
@@ -50,12 +50,17 @@ def define_node(
         dim = cast(int, node.args[1])
         if dim < 0:
             dim = dim % len(input_tensor.shape)
-        start = cast(int, node.args[2])
+
+        start = 0 if node.args[2] is None else cast(int, node.args[2])
         if start < 0:
             start = start % input_tensor.shape[dim]
-        end = min(cast(int, node.args[3]), input_tensor.shape[dim])
-        if end < 0:
-            end = end % input_tensor.shape[dim]
+
+        if len(node.args) > 3:
+            end = min(cast(int, node.args[3]), input_tensor.shape[dim])
+            if end < 0:
+                end = end % input_tensor.shape[dim]
+        else:
+            end = input_tensor.shape[dim]
 
         input_tensor_rank = len(input_tensor.shape)
         ranges = []
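
Note: aten.slice_copy can arrive with start=None (e.g. x[:1]) or with the end argument omitted entirely (e.g. x[1:]), which the previous code did not handle. A standalone restatement of the new normalization, handy for sanity checks outside the builder (a sketch, not the QNN code path itself):

    def normalize_slice(dim, start, end, shape):
        # mirrors the clamping logic in op_slice_copy.py above
        if dim < 0:
            dim = dim % len(shape)
        start = 0 if start is None else start
        if start < 0:
            start = start % shape[dim]
        if end is None:  # argument omitted -> slice runs to the end of the dim
            end = shape[dim]
        else:
            end = min(end, shape[dim])
            if end < 0:
                end = end % shape[dim]
        return dim, start, end

    # x[1:] on a (2, 1, 320, 512) tensor: dim 0, start 1, end omitted
    assert normalize_slice(0, 1, None, (2, 1, 320, 512)) == (0, 1, 2)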

backends/qualcomm/builders/op_to.py

+40-3
@@ -9,6 +9,7 @@
 
 import torch
 from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS
+from executorch.exir.dialects._ops import ops as exir_ops
 
 from .node_visitor import NodeVisitor, register_node_visitor
 from .qnn_constants import OpCast, OpConvert, QNN_OP_PACKAGE_NAME_QTI_AISW
@@ -90,9 +91,44 @@ def define_node(
             PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
             nodes_to_wrappers,
         )
+        node_input_tensors = [input_tensor_wrapper]
+
+        # if the output dtype is int64, we should cast it to int32 first
+        # since int32 is the only source that can be casted into int64
+        ops = []
+        if (
+            node.meta["val"].dtype == torch.int64
+            or input_node.meta["val"].dtype == torch.int64
+        ):
+            cast_node = self.edge_program.graph.create_node(
+                "call_function",
+                exir_ops.edge.aten._to_copy.default,
+                (input_node,),
+                {"dtype": torch.int32},
+            )
+            cast_node.meta["val"] = input_node.meta["val"].to(torch.int32)
+            cast_tensor = self.get_tensor(cast_node, node)
+            cast_tensor_wrapper = self.define_tensor(
+                cast_node,
+                node,
+                cast_tensor,
+                PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+                nodes_to_wrappers,
+            )
+            # graph is not allowed to be modified in partition stage
+            # erase it here to prevent lowering failure
+            self.edge_program.graph.erase_node(cast_node)
+            cast_op = PyQnnWrapper.PyQnnOpWrapper(
+                f"{node.name}_cast_i64_to_i32",
+                QNN_OP_PACKAGE_NAME_QTI_AISW,
+                OpCast.op_name,
+            )
+            node_input_tensors = [cast_tensor_wrapper]
+            cast_op.AddInputTensors([input_tensor_wrapper])
+            cast_op.AddOutputTensors([cast_tensor_wrapper])
+            ops.append(cast_op)
 
         output_tensor = self.get_tensor(node, node)
-
         output_tensor_wrapper = self.define_tensor(
             node,
             node,
@@ -105,7 +141,8 @@ def define_node(
         op = PyQnnWrapper.PyQnnOpWrapper(
             node.name, QNN_OP_PACKAGE_NAME_QTI_AISW, qnn_op.op_name
         )
-        op.AddInputTensors([input_tensor_wrapper])
+        op.AddInputTensors(node_input_tensors)
         op.AddOutputTensors([output_tensor_wrapper])
+        ops.append(op)
 
-        return op
+        return ops
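
Note: with the new i64 branch, an aten._to_copy touching int64 on either side is lowered as two ops: Cast(i64 -> i32) feeding the original Cast/Convert, with the visitor now returning a list of both. This is value-preserving as long as the integers fit in 32 bits. Eager-mode illustration of what the lowered pair computes:

    import torch

    x = torch.tensor([1, 2, 3], dtype=torch.int64)
    print(x.to(torch.int32).to(torch.float32))  # lowered: Cast(i64->i32) -> Cast(i32->f32)
    print(x.to(torch.float32))                  # eager reference; identical while |x| < 2**31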

backends/qualcomm/builders/qnn_constants.py

+6
@@ -252,6 +252,12 @@ class OpGather:
     param_axis: str = "axis"
 
 
+@dataclass(init=False, frozen=True)
+class OpGatherElements:
+    op_name: str = "GatherElements"
+    param_axis: str = "axis"
+
+
 @dataclass(init=False, frozen=True)
 class OpGatherND:
     op_name: str = "GatherNd"

backends/qualcomm/quantizer/annotators.py

+1-1
@@ -750,7 +750,7 @@ def annotate_elu(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)
 
 
-@register_annotator([torch.ops.aten.embedding.default])
+@register_annotator([torch.ops.aten.embedding.default, torch.ops.aten.gather.default])
 def annotate_embedding(node: Node, quantization_config: QuantizationConfig) -> None:
     weight = node.args[0]
 

backends/qualcomm/tests/models.py

+19
@@ -729,6 +729,17 @@ def forward(self, x):
         return torch.min(x, torch.full_like(x, self.fill))
 
 
+class Gather(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y):
+        index = torch.where(y > 0, torch.Tensor([1]).int(), torch.Tensor([1]).int()).to(
+            torch.int64
+        )
+        return torch.gather(x, x.dim() - 1, index)
+
+
 class Gelu(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1398,6 +1409,14 @@ def forward(self, x, y):
         return x[:, :seq_length] + self.position_ids[:, :seq_length]
 
 
+class SliceCopyDefaultParameter(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return torch.cat([x[:1], x[1:]], dim=1)
+
+
 class SliceCopyWithStep(torch.nn.Module):
     def __init__(self):
         super().__init__()
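
Note: in the Gather test module both torch.where branches are the same tensor, so the index is simply ones broadcast to y's shape, presumably to produce an int64 index of full rank (torch.gather requires index.dim() == input.dim()) while exercising the i64-to-i32 cast path. What the module computes, in eager mode:

    import torch

    x = torch.randn(2, 2, 3, 4)
    y = torch.randn(2, 2, 3, 4)
    index = torch.where(y > 0, torch.Tensor([1]).int(), torch.Tensor([1]).int()).to(torch.int64)
    out = torch.gather(x, x.dim() - 1, index)  # picks x[..., 1] everywhere
    assert torch.equal(out, x[..., 1:2].expand_as(x))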

backends/qualcomm/tests/test_qnn_delegate.py

+36-12
@@ -478,6 +478,13 @@ def test_qnn_backend_full_like(self):
         sample_input = (torch.randn(1, 2, 3, 4),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_gather(self):
+        module = Gather()  # noqa: F405
+        shape = (2, 2, 3, 4)
+        sample_input = (torch.randn(shape), torch.randn(shape))
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_gelu(self):
         module = Gelu()  # noqa: F405
         sample_input = (torch.randn(2, 5, 1, 3),)
@@ -821,12 +828,17 @@ def test_qnn_backend_select_copy(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_slice_copy(self):
-        modules = [SliceCopy(), SliceCopyWithStep()]  # noqa: F405
-        sample_input = (
-            torch.randn([1, 512]),
-            torch.randn([1, 8]),
-        )
-        for module in modules:
+        modules = [
+            SliceCopyDefaultParameter(),
+            SliceCopy(),
+            SliceCopyWithStep(),
+        ]  # noqa: F405
+        sample_inputs = [
+            (torch.randn([2, 1, 320, 512]),),
+            (torch.randn([1, 512]), torch.randn([1, 8])),
+            (torch.randn([1, 512]), torch.randn([1, 8])),
+        ]
+        for module, sample_input in zip(modules, sample_inputs):
             self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_stack(self):
@@ -1593,6 +1605,13 @@ def test_qnn_backend_full_like(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_gather(self):
+        module = Gather()  # noqa: F405
+        shape = (2, 2, 3, 4)
+        sample_input = (torch.randn(shape), torch.randn(shape))
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_gelu(self):
         module = Gelu()  # noqa: F405
         sample_input = (torch.randn(2, 5, 1, 3),)
@@ -1991,12 +2010,17 @@ def test_qnn_backend_sin(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_slice_copy(self):
-        modules = [SliceCopy(), SliceCopyWithStep()]  # noqa: F405
-        sample_input = (
-            torch.randn([1, 512]),
-            torch.randn([1, 8]),
-        )
-        for module in modules:
+        modules = [
+            SliceCopyDefaultParameter(),
+            SliceCopy(),
+            SliceCopyWithStep(),
+        ]  # noqa: F405
+        sample_inputs = [
+            (torch.randn([2, 1, 320, 512]),),
+            (torch.randn([1, 512]), torch.randn([1, 8])),
+            (torch.randn([1, 512]), torch.randn([1, 8])),
+        ]
+        for module, sample_input in zip(modules, sample_inputs):
             module = self.get_qdq_module(module, sample_input)
             self.lower_module_and_test_output(module, sample_input)
 
