From 29d0a214f806480a1071776a99fcd1b550abfb1d Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Sat, 22 Feb 2025 11:35:06 +0000 Subject: [PATCH] 2025-02-22 nightly release (54b371f75cae1b8226e07ff443ce8405df84871d) --- .ci/scripts/setup-linux.sh | 2 +- .ci/scripts/setup-macos.sh | 2 +- .ci/scripts/test_qnn_static_llama.sh | 4 +- .ci/scripts/unittest-buck2.sh | 14 + .ci/scripts/unittest-linux-cmake.sh | 13 + .ci/scripts/unittest-linux.sh | 48 ++ .ci/scripts/unittest-macos-cmake.sh | 13 + .ci/scripts/unittest-macos.sh | 52 ++ .ci/scripts/utils.sh | 4 +- .github/ISSUE_TEMPLATE/bug-report.yml | 2 +- .github/scripts/extract_benchmark_results.py | 17 +- .github/workflows/_unittest.yml | 56 +- .github/workflows/pull.yml | 12 + .github/workflows/trunk.yml | 12 +- CMakeLists.txt | 5 + CODEOWNERS | 85 +++ backends/apple/coreml/TARGETS | 1 + backends/arm/README.md | 4 +- backends/arm/_passes/TARGETS | 1 + backends/arm/_passes/arm_pass_manager.py | 4 + backends/arm/_passes/convert_to_clamp.py | 36 ++ backends/arm/_passes/decompose_select.py | 3 +- .../_passes/fuse_quantized_activation_pass.py | 10 +- backends/arm/_passes/insert_table_ops.py | 1 + backends/arm/operator_support/TARGETS | 3 +- .../tosa_supported_operators.py | 175 +++++- backends/arm/operators/TARGETS | 2 +- backends/arm/operators/__init__.py | 4 +- backends/arm/operators/op_abs.py | 133 ++++ backends/arm/operators/op_hardtanh.py | 66 -- backends/arm/operators/op_relu.py | 59 -- backends/arm/operators/ops_unary.py | 57 ++ .../arm/quantizer/quantization_annotator.py | 3 + backends/arm/scripts/build_executorch.sh | 123 ++++ .../arm/scripts/build_executorch_runner.sh | 125 ++++ .../arm/scripts/build_portable_kernels.sh | 74 +++ .../scripts/build_quantized_ops_aot_lib.sh | 37 +- backends/arm/scripts/run_fvp.sh | 104 ++++ .../arm/test/misc/test_multiple_outputs.py | 10 +- ...test_partition_decomposed_quantized_ops.py | 65 ++ backends/arm/test/models/test_w2l_arm.py | 150 +++++ backends/arm/test/ops/test_abs.py | 125 ++++ backends/arm/test/ops/test_bmm.py | 7 +- backends/arm/test/ops/test_cat.py | 1 - backends/arm/test/ops/test_floor.py | 82 +++ backends/arm/test/ops/test_layer_norm.py | 33 +- backends/arm/test/ops/test_logsoftmax.py | 46 +- backends/arm/test/ops/test_mean_dim.py | 12 +- backends/arm/test/ops/test_mm.py | 6 +- backends/arm/test/ops/test_select.py | 4 +- backends/arm/test/ops/test_softmax.py | 53 +- backends/arm/test/ops/test_sum.py | 12 +- backends/arm/test/ops/test_to_copy.py | 4 +- backends/arm/test/ops/test_var.py | 84 +-- .../arm/test/passes/test_convert_to_clamp.py | 80 +++ backends/arm/test/runner_utils.py | 2 +- backends/arm/test/setup_testing.sh | 8 +- backends/arm/test/test_arm_baremetal.sh | 120 ++-- backends/arm/test/test_model.py | 247 ++++++++ backends/cadence/CMakeLists.txt | 11 +- backends/cadence/aot/TARGETS | 2 + backends/cadence/aot/functions_hifi.yaml | 5 - backends/cadence/aot/fuse_ops.py | 8 +- backends/cadence/aot/memory_planning.py | 28 +- backends/cadence/aot/ops_registrations.py | 44 ++ backends/cadence/aot/pass_utils.py | 16 +- backends/cadence/aot/passes.py | 6 +- backends/cadence/aot/quantizer/fusion_pass.py | 53 +- backends/cadence/aot/quantizer/patterns.py | 33 + backends/cadence/aot/quantizer/quantizer.py | 18 +- backends/cadence/aot/remove_ops.py | 2 + backends/cadence/aot/reorder_ops.py | 8 + backends/cadence/aot/replace_ops.py | 20 +- .../cadence/aot/tests/test_memory_passes.py | 23 +- .../fusion_g3/operators/CMakeLists.txt | 3 +- backends/cadence/hifi/kernels/CMakeLists.txt | 3 +- 
.../cadence/hifi/operators/CMakeLists.txt | 5 +- backends/cadence/hifi/operators/op_clamp.cpp | 2 +- backends/cadence/hifi/operators/op_mean.cpp | 2 +- .../hifi/operators/op_quantized_relu_out.cpp | 6 +- .../cadence/hifi/operators/op_softmax.cpp | 2 +- backends/cadence/hifi/operators/op_where.cpp | 9 + .../cadence/reference/kernels/CMakeLists.txt | 3 +- .../reference/operators/CMakeLists.txt | 3 +- backends/qualcomm/tests/test_qnn_delegate.py | 247 ++++++-- backends/qualcomm/tests/utils.py | 2 + backends/transforms/fuse_view_copy.py | 17 + backends/transforms/targets.bzl | 14 + .../view_copy_to_squeeze_unsqueeze.py | 6 +- backends/vulkan/_passes/TARGETS | 7 +- backends/vulkan/_passes/__init__.py | 6 +- ..._inputs.py => squeeze_unsqueeze_inputs.py} | 17 +- .../runtime/graph/ops/glsl/batchnorm.glsl | 38 +- .../runtime/graph/ops/glsl/batchnorm.yaml | 1 + .../runtime/graph/ops/impl/Convolution.cpp | 7 +- backends/vulkan/vulkan_preprocess.py | 4 +- .../xnnpack/partition/config/gemm_configs.py | 15 +- backends/xnnpack/test/ops/test_linear.py | 70 +++ backends/xnnpack/test/ops/test_lstm.py | 5 +- build/Utils.cmake | 2 +- .../executorch-arm-delegate-tutorial.md | 31 +- examples/apple/coreml/llama/export.py | 285 +++++++++ .../apple/coreml/llama/llama_transformer.py | 570 ++++++++++++++++++ examples/apple/coreml/llama/readme.md | 39 ++ examples/apple/coreml/llama/run.py | 134 ++++ examples/arm/aot_arm_compiler.py | 9 +- examples/arm/run.sh | 344 +++-------- examples/arm/setup.sh | 3 +- examples/models/llama/TARGETS | 18 +- examples/models/llama/static_attention.py | 103 +++- .../llama/tests/test_static_attention.py | 203 ++++--- .../llama3_2_vision/install_requirements.sh | 4 +- examples/qualcomm/oss_scripts/llama/TARGETS | 14 +- examples/qualcomm/oss_scripts/llama/llama.py | 18 +- .../oss_scripts/llama/runner/runner.cpp | 14 + exir/tests/test_joint_graph.py | 4 +- .../pytorch/minibench/BenchmarkActivity.java | 5 + .../apple/Benchmark/Tests/LLaMA/LLaMATests.mm | 2 +- extension/flat_tensor/serialize/targets.bzl | 5 +- extension/flat_tensor/targets.bzl | 4 +- extension/module/CMakeLists.txt | 4 +- extension/module/module.cpp | 105 +++- extension/module/module.h | 27 +- extension/module/targets.bzl | 1 + extension/module/test/module_test.cpp | 25 +- extension/module/test/resources/README.md | 14 +- extension/module/test/resources/linear.ptd | Bin 0 -> 336 bytes extension/module/test/resources/linear.pte | Bin 0 -> 1208 bytes extension/pybindings/portable_lib.py | 1 + extension/pybindings/pybindings.cpp | 19 + extension/pybindings/pybindings.pyi | 9 + extension/pybindings/test/TARGETS | 8 + .../pybindings/test/test_backend_pybinding.py | 14 + kernels/optimized/cpu/op_log_softmax.cpp | 32 +- kernels/optimized/cpu/op_sub.cpp | 109 +--- kernels/optimized/cpu/targets.bzl | 14 +- .../portable/cpu/util/test/broadcast_test.cpp | 2 + kernels/test/op_sub_test.cpp | 116 ++++ runtime/__init__.py | 18 +- runtime/backend/interface.cpp | 11 + runtime/backend/interface.h | 10 + runtime/core/array_ref.h | 3 +- runtime/core/hierarchical_allocator.h | 3 +- runtime/core/portable_type/c10/README.md | 14 +- .../portable_type/c10/c10/macros/Export.h | 2 + .../core/portable_type/c10/c10/targets.bzl | 3 +- .../portable_type/c10/c10/util/BFloat16.h | 4 - .../core/portable_type/c10/c10/util/Half.h | 4 - .../core/portable_type/c10/c10/util/irange.h | 123 ++++ runtime/core/portable_type/targets.bzl | 3 + runtime/core/portable_type/tensor_impl.cpp | 4 +- .../core/portable_type/test/bfloat16_test.cpp | 9 +- 
runtime/core/portable_type/test/targets.bzl | 2 + .../portable_type/test/tensor_impl_test.cpp | 3 +- runtime/core/result.h | 9 +- runtime/core/targets.bzl | 9 +- runtime/core/tensor_layout.cpp | 3 +- runtime/core/test/error_handling_test.cpp | 1 + runtime/core/test/event_tracer_test.cpp | 5 +- runtime/core/test/memory_allocator_test.cpp | 7 +- runtime/core/test/targets.bzl | 2 + runtime/executor/method.cpp | 149 ++++- runtime/executor/method.h | 36 ++ runtime/executor/tensor_parser.h | 27 +- runtime/executor/tensor_parser_aten.cpp | 6 +- runtime/executor/tensor_parser_exec_aten.cpp | 134 ++-- runtime/executor/tensor_parser_portable.cpp | 7 +- runtime/kernel/targets.bzl | 1 + .../executorch/build/runtime_wrapper.bzl | 2 +- test/utils/targets.bzl | 1 + 170 files changed, 4960 insertions(+), 1254 deletions(-) create mode 100755 .ci/scripts/unittest-buck2.sh create mode 100755 .ci/scripts/unittest-linux-cmake.sh create mode 100755 .ci/scripts/unittest-linux.sh create mode 100755 .ci/scripts/unittest-macos-cmake.sh create mode 100755 .ci/scripts/unittest-macos.sh create mode 100644 CODEOWNERS create mode 100644 backends/arm/_passes/convert_to_clamp.py create mode 100644 backends/arm/operators/op_abs.py delete mode 100644 backends/arm/operators/op_hardtanh.py delete mode 100644 backends/arm/operators/op_relu.py create mode 100644 backends/arm/operators/ops_unary.py create mode 100755 backends/arm/scripts/build_executorch.sh create mode 100755 backends/arm/scripts/build_executorch_runner.sh create mode 100755 backends/arm/scripts/build_portable_kernels.sh create mode 100755 backends/arm/scripts/run_fvp.sh create mode 100644 backends/arm/test/misc/test_partition_decomposed_quantized_ops.py create mode 100644 backends/arm/test/models/test_w2l_arm.py create mode 100644 backends/arm/test/ops/test_abs.py create mode 100644 backends/arm/test/ops/test_floor.py create mode 100644 backends/arm/test/passes/test_convert_to_clamp.py create mode 100755 backends/arm/test/test_model.py rename backends/vulkan/_passes/{squeeze_int4_linear_inputs.py => squeeze_unsqueeze_inputs.py} (80%) create mode 100644 examples/apple/coreml/llama/export.py create mode 100644 examples/apple/coreml/llama/llama_transformer.py create mode 100644 examples/apple/coreml/llama/readme.md create mode 100644 examples/apple/coreml/llama/run.py create mode 100644 extension/module/test/resources/linear.ptd create mode 100644 extension/module/test/resources/linear.pte create mode 100644 extension/pybindings/test/test_backend_pybinding.py create mode 100644 runtime/core/portable_type/c10/c10/util/irange.h diff --git a/.ci/scripts/setup-linux.sh b/.ci/scripts/setup-linux.sh index 36fbcd7274..776bf6f795 100755 --- a/.ci/scripts/setup-linux.sh +++ b/.ci/scripts/setup-linux.sh @@ -22,7 +22,7 @@ fi # have already been installed, so we use PyTorch build from source here instead # of nightly. This allows CI to test against latest commits from PyTorch install_executorch "use-pt-pinned-commit" -build_executorch_runner "${BUILD_TOOL}" +build_executorch_runner "${BUILD_TOOL}" "${2:-Release}" if [[ "${GITHUB_BASE_REF:-}" == *main* || "${GITHUB_BASE_REF:-}" == *gh* ]]; then do_not_use_nightly_on_ci diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh index 033c299603..bb8e45f23f 100755 --- a/.ci/scripts/setup-macos.sh +++ b/.ci/scripts/setup-macos.sh @@ -136,7 +136,7 @@ install_pytorch_and_domains # We build PyTorch from source here instead of using nightly. 
This allows CI to test against # the pinned commit from PyTorch install_executorch "use-pt-pinned-commit" -build_executorch_runner "${BUILD_TOOL}" +build_executorch_runner "${BUILD_TOOL}" "${2:-Release}" if [[ "${GITHUB_BASE_REF:-}" == *main* || "${GITHUB_BASE_REF:-}" == *gh* ]]; then do_not_use_nightly_on_ci diff --git a/.ci/scripts/test_qnn_static_llama.sh b/.ci/scripts/test_qnn_static_llama.sh index 8aab21846f..5df74bddef 100644 --- a/.ci/scripts/test_qnn_static_llama.sh +++ b/.ci/scripts/test_qnn_static_llama.sh @@ -34,11 +34,11 @@ $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o to set +e # Compile only as weight sharing is not applicable on x86 -$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only +$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only exit_code1=$? # Checks accuracy with weight sharing disabled since x86 does not support weight sharing. -$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64 +$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64 exit_code2=$? # Check the exit codes and print messages diff --git a/.ci/scripts/unittest-buck2.sh b/.ci/scripts/unittest-buck2.sh new file mode 100755 index 0000000000..2e38657050 --- /dev/null +++ b/.ci/scripts/unittest-buck2.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +# TODO: expand this to //... +buck2 query //runtime/... + +# TODO: expand the covered scope of Buck targets. +buck2 build //runtime/core/portable_type/... +buck2 test //runtime/core/portable_type/... diff --git a/.ci/scripts/unittest-linux-cmake.sh b/.ci/scripts/unittest-linux-cmake.sh new file mode 100755 index 0000000000..7b61256eb5 --- /dev/null +++ b/.ci/scripts/unittest-linux-cmake.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +# Run pytest with coverage +pytest -n auto --cov=./ --cov-report=xml +# Run gtest +LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 \ +test/run_oss_cpp_tests.sh diff --git a/.ci/scripts/unittest-linux.sh b/.ci/scripts/unittest-linux.sh new file mode 100755 index 0000000000..e76b43fa22 --- /dev/null +++ b/.ci/scripts/unittest-linux.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +BUILD_TOOL=$1 +if [[ $BUILD_TOOL =~ ^(cmake|buck2)$ ]]; then + echo "Running unittests for ${BUILD_TOOL} ..." 
+else + echo "Missing build tool (require buck2 or cmake), exiting..." + exit 1 +fi + +BUILD_MODE=$2 +if [[ "${BUILD_MODE:-}" =~ ^(Debug|Release)$ ]]; then + echo "Running tests in build mode ${BUILD_MODE} ..." +else + echo "Unsupported build mode ${BUILD_MODE}, options are Debug or Release." + exit 1 +fi + +# The generic Linux job chooses to use base env, not the one setup by the image +eval "$(conda shell.bash hook)" +CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") +conda activate "${CONDA_ENV}" + +# Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate +source .ci/scripts/setup-vulkan-linux-deps.sh + +PYTHON_EXECUTABLE=python \ +EXECUTORCH_BUILD_PYBIND=ON \ +CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \ +.ci/scripts/setup-linux.sh "$BUILD_TOOL" "$BUILD_MODE" + +# Install llama3_2_vision dependencies. +PYTHON_EXECUTABLE=python ./examples/models/llama3_2_vision/install_requirements.sh + +if [[ "$BUILD_TOOL" == "cmake" ]]; then + .ci/scripts/unittest-linux-cmake.sh +elif [[ "$BUILD_TOOL" == "buck2" ]]; then + .ci/scripts/unittest-buck2.sh +else + echo "Unknown build tool $BUILD_TOOL" + exit 1 +fi diff --git a/.ci/scripts/unittest-macos-cmake.sh b/.ci/scripts/unittest-macos-cmake.sh new file mode 100755 index 0000000000..cdb40c4024 --- /dev/null +++ b/.ci/scripts/unittest-macos-cmake.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +# Run pytest with coverage +${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml +# Run gtest +LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \ +${CONDA_RUN} test/run_oss_cpp_tests.sh diff --git a/.ci/scripts/unittest-macos.sh b/.ci/scripts/unittest-macos.sh new file mode 100755 index 0000000000..c0e39cee33 --- /dev/null +++ b/.ci/scripts/unittest-macos.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +BUILD_TOOL=$1 +if [[ $BUILD_TOOL =~ ^(cmake|buck2)$ ]]; then + echo "Running unittests for ${BUILD_TOOL} ..." +else + echo "Missing build tool (require buck2 or cmake), exiting..." + exit 1 +fi + +BUILD_MODE=$2 +if [[ $BUILD_MODE =~ ^(Debug|Release)$ ]]; then + echo "Running tests in build mode ${BUILD_MODE} ..." +else + echo "Unsupported build mode ${BUILD_MODE}, options are Debug or Release." + exit 1 +fi + +bash .ci/scripts/setup-conda.sh +eval "$(conda shell.bash hook)" + +# Create temp directory for sccache shims +export TMP_DIR=$(mktemp -d) +export PATH="${TMP_DIR}:$PATH" +trap 'rm -rfv ${TMP_DIR}' EXIT + +# Setup MacOS dependencies as there is no Docker support on MacOS atm +PYTHON_EXECUTABLE=python \ +EXECUTORCH_BUILD_PYBIND=ON \ +CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \ +${CONDA_RUN} --no-capture-output \ +.ci/scripts/setup-macos.sh "${BUILD_TOOL}" "${BUILD_MODE}" + +# Install llama3_2_vision dependencies. 
+PYTHON_EXECUTABLE=python \ +${CONDA_RUN} --no-capture-output \ +./examples/models/llama3_2_vision/install_requirements.sh + +if [[ "$BUILD_TOOL" == "cmake" ]]; then + .ci/scripts/unittest-macos-cmake.sh +elif [[ "$BUILD_TOOL" == "buck2" ]]; then + .ci/scripts/unittest-buck2.sh +else + echo "Unknown build tool $BUILD_TOOL" + exit 1 +fi diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index be684b7bfa..e0bc935e86 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -109,7 +109,7 @@ build_executorch_runner_cmake() { pushd "${CMAKE_OUTPUT_DIR}" || return # This command uses buck2 to gather source files and buck2 could crash flakily # on MacOS - retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE=Release .. + retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE="${1:-Release}" .. popd || return if [ "$(uname)" == "Darwin" ]; then @@ -124,7 +124,7 @@ build_executorch_runner() { if [[ $1 == "buck2" ]]; then build_executorch_runner_buck2 elif [[ $1 == "cmake" ]]; then - build_executorch_runner_cmake + build_executorch_runner_cmake "$2" else echo "Invalid build tool $1. Only buck2 and cmake are supported atm" exit 1 diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 86363e7da9..010f7c1132 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -21,7 +21,7 @@ body: A clear and concise description of what the bug is. ```python - # Sample code to reproduce the problem + # Sample code to reproduce the problem. If applicable, also include your model export command. ``` ``` diff --git a/.github/scripts/extract_benchmark_results.py b/.github/scripts/extract_benchmark_results.py index 76f0e53338..ba6142a482 100755 --- a/.github/scripts/extract_benchmark_results.py +++ b/.github/scripts/extract_benchmark_results.py @@ -229,11 +229,7 @@ def extract_ios_metric( elif method == "forward": if metric_name == "Clock Monotonic Time, s": - benchmark_result["metric"] = ( - "generate_time(ms)" - if "llama" in test_name - else "avg_inference_latency(ms)" - ) + benchmark_result["metric"] = "avg_inference_latency(ms)" benchmark_result["actualValue"] = metric_value * 1000 elif metric_name == "Memory Peak Physical, kB": @@ -241,9 +237,14 @@ def extract_ios_metric( benchmark_result["metric"] = "peak_inference_mem_usage(mb)" benchmark_result["actualValue"] = metric_value / 1024 - elif method == "generate" and metric_name == "Tokens Per Second, t/s": - benchmark_result["metric"] = "token_per_sec" - benchmark_result["actualValue"] = metric_value + elif method == "generate": + if metric_name == "Clock Monotonic Time, s": + benchmark_result["metric"] = "generate_time(ms)" + benchmark_result["actualValue"] = metric_value * 1000 + + elif metric_name == "Tokens Per Second, t/s": + benchmark_result["metric"] = "token_per_sec" + benchmark_result["actualValue"] = metric_value return benchmark_result diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index 414f86494b..f2eb2cfdb5 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -7,6 +7,14 @@ on: required: true type: string description: Name of the docker image to use. + build-mode: + required: true + type: string + description: Build mode to use, Debug or Release. + build-tool: + required: true + type: string + description: Build tool to use, cmake or buck2. 
python-version: required: false type: string @@ -26,28 +34,7 @@ jobs: timeout: 90 script: | set -eux - - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate - source .ci/scripts/setup-vulkan-linux-deps.sh - - # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python \ - EXECUTORCH_BUILD_PYBIND=ON \ - CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \ - .ci/scripts/setup-linux.sh cmake - - # Install llama3_2_vision dependencies. - PYTHON_EXECUTABLE=python ./examples/models/llama3_2_vision/install_requirements.sh - - # Run pytest with coverage - pytest -n auto --cov=./ --cov-report=xml - # Run gtest - LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 \ - test/run_oss_cpp_tests.sh + .ci/scripts/unittest-linux.sh "${{ inputs.build-tool }}" "${{ inputs.build-mode }}" macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main @@ -58,27 +45,4 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | set -eux - - bash .ci/scripts/setup-conda.sh - - # Create temp directory for sccache shims - export TMP_DIR=$(mktemp -d) - export PATH="${TMP_DIR}:$PATH" - trap 'rm -rfv ${TMP_DIR}' EXIT - - # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python \ - EXECUTORCH_BUILD_PYBIND=ON \ - CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \ - ${CONDA_RUN} --no-capture-output \ - .ci/scripts/setup-macos.sh cmake - - # Install llama3_2_vision dependencies. 
- PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - ./examples/models/llama3_2_vision/install_requirements.sh - - # Run pytest with coverage - ${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml - # Run gtest - LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \ - ${CONDA_RUN} test/run_oss_cpp_tests.sh + .ci/scripts/unittest-macos.sh "${{ inputs.build-tool }}" "${{ inputs.build-mode }}" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index fac2319789..b599f2fdc6 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -367,6 +367,18 @@ jobs: id-token: write contents: read with: + build-mode: Debug + build-tool: cmake + docker-image: executorch-ubuntu-22.04-clang12 + + unittest-buck: + uses: ./.github/workflows/_unittest.yml + permissions: + id-token: write + contents: read + with: + build-mode: Debug + build-tool: buck2 docker-image: executorch-ubuntu-22.04-clang12 unittest-arm: diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 04a6c96f3e..64e2684787 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -159,7 +159,7 @@ jobs: sudo sysctl fs.inotify.max_user_watches=1048576 # 1024 * 1024 # Test ethos-u delegate examples with run.sh - backends/arm/test/test_arm_baremetal.sh test_run_ethosu_fvp + backends/arm/test/test_arm_baremetal.sh test_full_ethosu_fvp test-arm-reference-delegation: @@ -489,3 +489,13 @@ jobs: PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}" + + unittest-release: + uses: ./.github/workflows/_unittest.yml + permissions: + id-token: write + contents: read + with: + build-mode: Release + build-tool: cmake + docker-image: executorch-ubuntu-22.04-clang12 diff --git a/CMakeLists.txt b/CMakeLists.txt index be0921a0b5..01ad728c42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,6 +258,11 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) set(EXECUTORCH_BUILD_EXTENSION_MODULE ON) endif() +if(EXECUTORCH_BUILD_EXTENSION_MODULE) + set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON) + set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON) +endif() + if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON) diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000000..7a9d2a88f8 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,85 @@ +# IMPORTANT: +# This file is ONLY used to subscribe for notifications for PRs +# related to a specific file path. Approvals from people in this +# file are not required for merges. 
+ +/backends/apple @shoumikhin @cccclai +/backends/apple/mps @cccclai @DenisVieriu97 +/backends/arm @digantdesai +/backends/cadence @tarun292 +/backends/example @iseeyuan @JacobSzwejbka @larryliu0820 +/backends/mediatek @cccclai @neuropilot-captain +/backends/qualcomm @cccclai @chunit-quic @haowhsu-quic @shewu-quic @winskuo-quic +/backends/test @cccclai +/backends/transforms @kimishpatel +/backends/vulkan @SS-JIA +/backends/xnnpack @digantdesai @mcr229 + +/build @GregoryComer @dbort @kirklandsign + +/codegen @larryliu0820 @lucylq + +/devtools @tarun292 @Gasoonjia + +/docs @mergennachin + +/examples/apple @shoumikhin +/examples/apple/coreml @cccclai @metascroy @cymbalrush @YifanShenSZ +/examples/arm @digantdesai +/examples/cadence @tarun292 +/examples/demo-apps @shoumikhin @kirklandsign +/examples/devtools @tarun292 +/examples/llm_manual @larryliu0820 +/examples/llm_pte_finetuning @JacobSzwejbka +/examples/mediatek @cccclai +/examples/models @lucylq +/examples/portable @larryliu0820 @manuelcandales +/examples/qualcomm @cccclai +/examples/selective_build @lucylq @larryliu0820 @JacobSzwejbka +/examples/xnnpack @digantdesai @mcr229 + +/exir/backend @cccclai @kimishpatel @JacobSzwejbka @tarun292 +/exir @JacobSzwejbka @tarun292 @larryliu0820 + + +/extension/android @kirklandsign +/extension/android_test @kirklandsign +/extension/apple @shoumikhin +/extension/aten_util @JacobSzwejbka +/extension/benchmark @tarun292 +/extension/data_loader @JacobSzwejbka @lucylq @dbort +/extension/evalue_util @GregoryComer @dbort +/extension/export_util @kimishpatel +/extension/flat_tensor @lucylq +/extension/gguf_util @larryliu0820 +/extension/kernel_util @kimishpatel @manuelcandales +/extension/llm @jackzhxng @iseeyuan @larryliu0820 +/extension/memory_allocator @JacobSzwejbka @dbort +/extension/module @shoumikhin +/extension/parallel @kimishpatel +/extension/pybindings @JacobSzwejbka @larryliu0820 +/extension/pytree @JacobSzwejbka +/extension/runner_util @dbort +/extension/tensor @shoumikhin +/extension/testing_util @dbort +/extension/threadpool @kimishpatel +/extension/training @JacobSzwejbka + +/kernels @manuelcandales + +/profiler @tarun292 @Gasoonjia + +/runtime @dbort @JacobSzwejbka @lucylq +/runtime/backend @cccclai + +/schema @dbort @JacobSzwejbka @lucylq + +/scripts @GregoryComer + +/shim @larryliu0820 @GregoryComer + +/third-party @GregoryComer + +/test @larryliu0820 @kirklandsign + +/util @tarun292 diff --git a/backends/apple/coreml/TARGETS b/backends/apple/coreml/TARGETS index 9d722457e3..d77e33679a 100644 --- a/backends/apple/coreml/TARGETS +++ b/backends/apple/coreml/TARGETS @@ -76,6 +76,7 @@ runtime.cxx_python_extension( base_module = "", visibility = [ "//executorch/examples/apple/coreml/...", + "@EXECUTORCH_CLIENTS", ], external_deps = [ "pybind11", diff --git a/backends/arm/README.md b/backends/arm/README.md index 9a5a6f9408..04815bf23d 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -55,10 +55,10 @@ To run the unit test suite with Corstone3x0 FVP simulator support use backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp ``` -You can test to run some models with the run.sh flow +You can test to run some models with the full fvp test flow ``` -backends/arm/test/test_arm_baremetal.sh test_run_ethosu_fvp +backends/arm/test/test_arm_baremetal.sh test_full_ethosu_fvp ``` ## Unit tests diff --git a/backends/arm/_passes/TARGETS b/backends/arm/_passes/TARGETS index 6ca59cfee2..843d6b159d 100644 --- a/backends/arm/_passes/TARGETS +++ b/backends/arm/_passes/TARGETS @@ -7,6 
+7,7 @@ python_library( deps = [ "//executorch/backends/arm:tosa_quant_utils", "//executorch/backends/arm:tosa_utils", + "//executorch/backends/transforms:replace_scalar_with_tensor", "//executorch/backends/xnnpack/_passes:xnnpack_passes", "//executorch/exir:lib", ], diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 28d70591e5..331d45e912 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -27,6 +27,7 @@ from executorch.backends.arm._passes.convert_squeezes_to_view import ( # type: ignore[import-not-found] ConvertSqueezesToViewPass, ) +from executorch.backends.arm._passes.convert_to_clamp import ConvertToClampPass from executorch.backends.arm._passes.decompose_batchnorm_pass import ( DecomposeBatchNormPass, ) @@ -104,6 +105,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul self.add_pass(DecomposeLinearPass()) self.add_pass(ConvertMeanDimToAveragePoolPass()) self.add_pass(ConvertFullLikeToFullPass()) + self.add_pass(ConvertToClampPass()) self.add_pass(ReplaceScalarWithTensorArgPass()) self.add_pass(AnnotateDecomposedMatmulPass()) @@ -144,6 +146,8 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul self.add_pass(DecomposeDivPass()) self.add_pass(DecomposeSoftmaxesPass()) self.add_pass(ConvertFullLikeToFullPass()) + self.add_pass(ConvertToClampPass()) + self.add_pass(AnnotateDecomposedMatmulPass()) self.add_pass(QuantizeOperatorArguments()) self.add_pass(FoldAndAnnotateQParamsPass()) # type: ignore[call-arg] diff --git a/backends/arm/_passes/convert_to_clamp.py b/backends/arm/_passes/convert_to_clamp.py new file mode 100644 index 0000000000..8f2c9b16f9 --- /dev/null +++ b/backends/arm/_passes/convert_to_clamp.py @@ -0,0 +1,36 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + +edge_operators = { + exir_ops.edge.aten.hardtanh.default, + exir_ops.edge.aten.relu.default, +} + + +def get_clamp_params(op, args) -> Tuple[float | None, float | None]: + if op == exir_ops.edge.aten.hardtanh.default: + return args[1], args[2] + elif op == exir_ops.edge.aten.relu.default: + return 0.0, None + else: + raise ValueError(f"Getting clamp parameters for op {op} is not implemented.") + + +class ConvertToClampPass(ExportPass): + def call_operator(self, op, args, kwargs, meta): + if op not in edge_operators: + return super().call_operator(op, args, kwargs, meta) + + return super().call_operator( + exir_ops.edge.aten.clamp.default, + (args[0], *get_clamp_params(op, args)), + {}, + meta, + ) diff --git a/backends/arm/_passes/decompose_select.py b/backends/arm/_passes/decompose_select.py index 5e04668df9..9a25b7c28a 100644 --- a/backends/arm/_passes/decompose_select.py +++ b/backends/arm/_passes/decompose_select.py @@ -35,8 +35,9 @@ def call(self, graph_module: torch.fx.GraphModule): input_node, dim, index = node.args rank = len(input_node.meta["val"].size()) + shape = input_node.meta["val"].shape dim = dim % rank if dim < 0 else dim - index = index % rank if index < 0 else index + index = index % shape[dim] if index < 0 else index with graph_module.graph.inserting_before(node): slice_node = create_node( diff --git a/backends/arm/_passes/fuse_quantized_activation_pass.py b/backends/arm/_passes/fuse_quantized_activation_pass.py index 3ac9f5cbb9..13c69bf92f 100644 --- a/backends/arm/_passes/fuse_quantized_activation_pass.py +++ b/backends/arm/_passes/fuse_quantized_activation_pass.py @@ -13,7 +13,8 @@ class FuseQuantizedActivationPass(ExportPass): - def _is_fuseable_quantized_activation(self, node: Node): + @staticmethod + def _is_fuseable_quantized_activation(node: Node): """Fuse activations that have a 0 lower bound and quantized with a qmin zero-point""" is_fuseable = node.target == exir_ops.edge.aten.relu.default if node.target == exir_ops.edge.aten.hardtanh.default: @@ -29,7 +30,8 @@ def _is_fuseable_quantized_activation(self, node: Node): else: return False - def _is_fuseable_input(self, node: Node): + @staticmethod + def _is_fuseable_input(node: Node): return ( node.target in ( @@ -45,11 +47,11 @@ def call(self, graph_module: torch.fx.GraphModule): if node.op != "call_function": continue - if not self._is_fuseable_quantized_activation(node): + if not FuseQuantizedActivationPass._is_fuseable_quantized_activation(node): continue input_node = node.args[0] - if not self._is_fuseable_input(input_node): + if not FuseQuantizedActivationPass._is_fuseable_input(input_node): continue node.replace_all_uses_with(input_node) diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py index ada4d646c0..77de46fcd2 100644 --- a/backends/arm/_passes/insert_table_ops.py +++ b/backends/arm/_passes/insert_table_ops.py @@ -39,6 +39,7 @@ class InsertTableOpsPass(ExportPass): table_ops: Dict[EdgeOpOverload, Callable[[torch.Tensor], torch.Tensor]] = { exir_ops.edge.aten.exp.default: torch.exp, + exir_ops.edge.aten.floor.default: torch.floor, exir_ops.edge.aten.log.default: torch.log, exir_ops.edge.aten.reciprocal.default: torch.reciprocal, exir_ops.edge.aten.rsqrt.default: torch.rsqrt, diff --git a/backends/arm/operator_support/TARGETS b/backends/arm/operator_support/TARGETS index eb8c78bcf2..0de9f060bf 100644 --- 
a/backends/arm/operator_support/TARGETS +++ b/backends/arm/operator_support/TARGETS @@ -5,8 +5,9 @@ python_library( srcs = glob(["*.py"]), typing = True, deps = [ + "//executorch/backends/arm/_passes:passes", + "//executorch/backends/arm:tosa_specification", "//executorch/backends/xnnpack/_passes:xnnpack_passes", "//executorch/exir:lib", - "//executorch/backends/arm:tosa_specification" ], ) diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index 1fa626efce..6fe70aa696 100644 --- a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -5,13 +5,22 @@ # pyre-unsafe +import itertools import operator +import typing from typing import final, Optional, Sequence, Type +import torch + import torch.fx as fx +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm._passes.fuse_quantized_activation_pass import ( + FuseQuantizedActivationPass, +) from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.exir.dialects._ops import ops as exir_ops from torch.fx.passes.operator_support import any_chain, chain, OperatorSupportBase +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions class SupportedTOSAOperatorCheck(OperatorSupportBase): @@ -27,7 +36,9 @@ def __init__(self, tosa_spec: TosaSpecification): targets: list[str] = [] @final - def is_node_supported(self, submodules, node: fx.Node) -> bool: + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: if node.target not in self.targets: return False return self.is_node_tosa_supported(node, self.tosa_spec) @@ -75,6 +86,10 @@ def tosa_support_factory( tosa_spec: TosaSpecification, additional_checks: Optional[Sequence[OperatorSupportBase]] = None, ) -> OperatorSupportBase: + negative_checks: list[OperatorSupportBase] = [] + if not tosa_spec.support_float(): + negative_checks.append(NeedsDecompositionCheck()) + negative_checks.append(CheckProperQuantization()) return chain( any_chain( BaseTOSASupportList(), @@ -83,14 +98,18 @@ def tosa_support_factory( for check in get_registered_tosa_support_checks(tosa_spec) ), ), + *negative_checks, *additional_checks if additional_checks else [], ) class BaseTOSASupportList(OperatorSupportBase): - def is_node_supported(self, submodules, node: fx.Node) -> bool: + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: supported = node.op == "call_function" and node.target in [ + exir_ops.edge.aten.abs.default, exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.expand_copy.default, exir_ops.edge.aten.cat.default, @@ -106,6 +125,7 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool: exir_ops.edge.aten.log.default, exir_ops.edge.aten.linear.default, exir_ops.edge.aten.split_with_sizes_copy.default, + exir_ops.edge.aten.floor.default, exir_ops.edge.aten.full.default, exir_ops.edge.aten.full_like.default, exir_ops.edge.aten.ge.Tensor, @@ -148,3 +168,154 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool: ] return supported + + +class NeedsDecompositionCheck(OperatorSupportBase): + """ + Targeted operators need to be decomposed prior to quantization in order to get a pair of q-dq-nodes surrounding + the operator, and to get optimal quantization parameters for each operator. This check will reject operators + that need to be decomposed. 
+ """ + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + + if node.op != "call_function": + return True + if node.target == exir_ops.edge.aten.mean.dim: + dim = node.args[1] + return dim == [-1, -2] + needs_decomp = node.target in [ + exir_ops.edge.aten.div.Tensor, + exir_ops.edge.aten._native_batch_norm_legit_no_training.default, + exir_ops.edge.aten.native_layer_norm.default, + exir_ops.edge.aten.mean.dim, + exir_ops.edge.aten._softmax.default, + exir_ops.edge.aten._log_softmax.default, + exir_ops.edge.aten.var.correction, + exir_ops.edge.aten.var.dim, + ] + return not needs_decomp + + +class CheckProperQuantization(OperatorSupportBase): + """ + For targeted nodes, check that it has been quantized as expected. In most cases this means that a pair of quantize + and dequantize nodes surrounds the node. This is neccessary for table operators and operators that need to rescale + activations. + """ + + dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default + q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default + + def _is_matmul_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ): + """ + Find the matmul source partition containing this node and check that all its inputs and outputs are quantized. + """ + for graph_module in submodules.values(): + graph_module = typing.cast(fx.GraphModule, graph_module) + matmul_partitions = get_source_partitions( + graph_module.graph, + [ + torch.matmul, + ], + None, + ) + matmul_partitions = list( + itertools.chain.from_iterable(matmul_partitions.values()) + ) + matched_partition = None + for partition in matmul_partitions: + if node in partition.nodes: + matched_partition = partition + if matched_partition is not None: + input_quantized = all( + input_node.target == self.dq_op + for input_node in matched_partition.input_nodes + ) + if not input_quantized: + return False + output_quantized = all( + output_node_user.target == self.q_op + for output_node_user in matched_partition.output_nodes[0].users + ) + if not output_quantized: + return False + else: + return False + + return True + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + output_quantized = False + input_quantized = False + if node.target not in ( + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.avg_pool2d.default, + exir_ops.edge.aten.bmm.default, + exir_ops.edge.aten.convolution.default, + exir_ops.edge.aten.exp.default, + exir_ops.edge.aten.hardtanh.default, + exir_ops.edge.aten.linear.default, + exir_ops.edge.aten.log.default, + exir_ops.edge.aten.max_pool2d_with_indices.default, + exir_ops.edge.aten.mm.default, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.reciprocal.default, + exir_ops.edge.aten.relu.default, + exir_ops.edge.aten.rsqrt.default, + exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.sub.Tensor, + exir_ops.edge.aten.tanh.default, + exir_ops.edge.aten.upsample_nearest2d.vec, + ): + return True + elif node.target in ( + exir_ops.edge.aten.bmm.default, + exir_ops.edge.aten.mm.default, + ): + source_fn_stack: tuple[typing.Any] = node.meta.get("source_fn_stack", []) + if len(source_fn_stack) > 0: + if source_fn_stack[-1][1] in (torch.matmul,): + return self._is_matmul_node_supported(submodules, node) + + elif node.target in (exir_ops.edge.aten.max_pool2d_with_indices.default,): + users = node.users + output_quantized = all( + user.target == operator.getitem + and 
all(user_user.target == self.q_op for user_user in user.users) + for user in users + ) + elif FuseQuantizedActivationPass._is_fuseable_input(node): + users = node.users + output_quantized = all( + FuseQuantizedActivationPass._is_fuseable_quantized_activation(user) + for user in users + ) + elif FuseQuantizedActivationPass._is_fuseable_quantized_activation(node): + input_node = node.all_input_nodes[0] + input_quantized = FuseQuantizedActivationPass._is_fuseable_input(input_node) + + input_quantized = input_quantized or all( + (input_node.target == self.dq_op) + or (not get_first_fake_tensor(input_node).dtype.is_floating_point) + for input_node in node.all_input_nodes + ) + + if not input_quantized: + return False + + output_quantized = output_quantized or all( + (output_node.target == self.q_op) + or (not get_first_fake_tensor(output_node).dtype.is_floating_point) + for output_node in node.users + ) + + if not output_quantized: + return False + return True diff --git a/backends/arm/operators/TARGETS b/backends/arm/operators/TARGETS index 1f91aa37b7..cb08adb035 100644 --- a/backends/arm/operators/TARGETS +++ b/backends/arm/operators/TARGETS @@ -13,7 +13,7 @@ python_library( python_library( name = "ops", - srcs = glob(["op_*.py"]), + srcs = glob(["op_*.py", "ops_*.py"]), typing = True, deps = [ "fbsource//third-party/serialization_lib/python/tosa:tosa", diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 735debe367..e98d7e7693 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -7,6 +7,7 @@ from . import ( # noqa node_visitor, + op_abs, op_add, op_avg_pool2d, op_bmm, @@ -20,7 +21,6 @@ op_ge, op_get_item, op_gt, - op_hardtanh, op_le, op_log, op_lt, @@ -30,7 +30,6 @@ op_mul, op_permute, op_reciprocal, - op_relu, op_repeat, op_rescale, op_rshift, @@ -47,4 +46,5 @@ op_upsample_nearest2d, op_view, ops_binary, + ops_unary, ) diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py new file mode 100644 index 0000000000..886a96fd52 --- /dev/null +++ b/backends/arm/operators/op_abs.py @@ -0,0 +1,133 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import executorch.backends.arm.tosa_quant_utils as tqutils +import executorch.backends.arm.tosa_utils as tutils + +import serializer.tosa_serializer as ts # type: ignore +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_specification import TosaSpecification + +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class AbsVisitor_080_BI(NodeVisitor): + target = "aten.abs.default" + + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-0.80+BI"), + ] + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + # Specification (0.80) states that input and output types + # should all be the same + if not (inputs[0].dtype == output.dtype): + raise ValueError( + "All inputs and outputs need same dtype." 
+ f"Got {inputs[0].dtype=}, {output.dtype=}" + ) + # Handle int8 (quantized) and int32 + if not (inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]): + raise ValueError( + "All inputs need to be INT8 or INT32." f"Got {inputs[0].dtype=}" + ) + + if inputs[0].dtype == ts.DType.INT8: + rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( + tosa_graph, inputs, node + ) + else: + # input[0].dtype == ts.DType.INT32 + # Non quantized input, natively support by TOSA.abs + rescaled_inputs = inputs + + if output.dtype == ts.DType.INT8: + broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order) + abs_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32) + else: + # output.dtype == ts.DType.INT32 + abs_output = output + + # Do the INT32 Abs + tosa_graph.addOperator( + TosaOp.Op().ABS, + [ + rescaled_inputs[0].name, + ], + [abs_output.name], + None, + ) + + if output.dtype == ts.DType.INT8: + # Scale output back to 8 bit + # pyre-ignore + tqutils.insert_rescale_op_to_int8(tosa_graph, abs_output, scale_back, node) # type: ignore[possibly-undefined] + + +@register_node_visitor +class AbsVisitor_080_MI(AbsVisitor_080_BI): + # inheriting 'target' from BI class + + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-0.80+MI"), + ] + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + # Specification (0.80) states that input and output types + # should all be the same + if not (inputs[0].dtype == output.dtype): + raise ValueError( + "All inputs and output need same dtype." + f"Got {inputs[0].dtype=}, {output.dtype=}" + ) + + if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]: + # Call the inherited define_node for handling integers + super().define_node(node, tosa_graph, inputs, output) + else: + # FP32 Abs lowering + + if not (inputs[0].dtype == ts.DType.FP32): + raise ValueError( + "All inputs need to be FP32." f"Got {inputs[0].dtype=}" + ) + + if not (output.dtype == ts.DType.FP32): + raise ValueError("All outputs need to be FP32." f"Got {output.dtype=}") + + # MI lowering + tosa_graph.addOperator( + TosaOp.Op().ABS, + [inputs[0].name], + [output.name], + None, + ) diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py deleted file mode 100644 index fc0ee552a9..0000000000 --- a/backends/arm/operators/op_hardtanh.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2023-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe -from typing import List - -import serializer.tosa_serializer as ts # type: ignore -import torch - -# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.' 
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( - get_input_qparams, -) -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg - -from serializer.tosa_serializer import TosaOp - - -@register_node_visitor -class HardTanhVisitor(NodeVisitor): - target = "aten.hardtanh.default" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - attr = ts.TosaSerializerAttribute() - - if inputs[0].dtype == ts.DType.INT8: - # Get quant parameters - input_qparams = get_input_qparams(node) # pyre-ignore[16] - qargs = input_qparams[0] - # Convert to quantized representation - clamp_min_qs = qargs.quantize_value(inputs[1].number).item() - clamp_max_qs = qargs.quantize_value(inputs[2].number).item() - # Set fp values to 0.0 since they are not used - clamp_min_fp = 0.0 - clamp_max_fp = 0.0 - else: - clamp_min_fp = inputs[1].number - clamp_max_fp = inputs[2].number - # Set qs values to 0 since they are not used - clamp_min_qs = 0 - clamp_max_qs = 0 - - attr.ClampAttribute( - tosa_graph.builder, - clamp_min_qs, - clamp_max_qs, - clamp_min_fp, - clamp_max_fp, - ) - - tosa_graph.addOperator(TosaOp.Op().CLAMP, [inputs[0].name], [output.name], attr) diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py deleted file mode 100644 index c37e4b3e75..0000000000 --- a/backends/arm/operators/op_relu.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2024-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import serializer.tosa_serializer as ts # type: ignore -import torch.fx - -# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.' -from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( - get_output_qparams, -) -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp - - -@register_node_visitor -class ReluVisitor(NodeVisitor): - target = "aten.relu.default" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, - inputs: list[TosaArg], - output: TosaArg, - ) -> None: - attr = ts.TosaSerializerAttribute() - - clamp_min_fp = 0.0 - clamp_max_fp = 0.0 - clamp_min_qs = 0 - clamp_max_qs = 0 - if inputs[0].dtype == ts.DType.INT8: - out_qargs = get_output_qparams(node) # pyre-ignore[16] - clamp_min_qs = out_qargs[0].quantize_value(0).item() - clamp_max_qs = out_qargs[0].quantize_value(float("inf")).item() - else: - clamp_min_fp = 0 - clamp_max_fp = float("inf") - - attr.ClampAttribute( - tosa_graph.builder, - clamp_min_qs, - clamp_max_qs, - clamp_min_fp, - clamp_max_fp, - ) - - tosa_graph.addOperator(TosaOp.Op().CLAMP, [inputs[0].name], [output.name], attr) diff --git a/backends/arm/operators/ops_unary.py b/backends/arm/operators/ops_unary.py new file mode 100644 index 0000000000..31397b9a3b --- /dev/null +++ b/backends/arm/operators/ops_unary.py @@ -0,0 +1,57 @@ +# Copyright 2025 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts # type: ignore +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) + +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_specification import TosaSpecification +from serializer.tosa_serializer import TosaOp + + +def unary_operator_factory(unary_target: str, tosa_op): + "Creates and registers NodeVisitors for operations that have one input and map directly into a TOSA op." + + class UnaryOperator_080_MI(NodeVisitor): + target = unary_target + + tosa_specs = [TosaSpecification.create_from_string("TOSA-0.80+MI")] + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + + if not (inputs[0].dtype == output.dtype): + raise ValueError( + "All inputs and output need same dtype." + f"Got {inputs[0].dtype=}, {output.dtype=}" + ) + + if not (inputs[0].dtype == ts.DType.FP32): + raise ValueError( + "All inputs need to be FP32." f"Got {inputs[0].dtype=}" + ) + + # MI lowering + tosa_graph.addOperator(tosa_op, [inputs[0].name], [output.name]) + + register_node_visitor(UnaryOperator_080_MI) + + +unary_operator_factory("aten.floor.default", TosaOp.Op().FLOOR) diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index f1cef97178..09eb3e2a12 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -125,7 +125,9 @@ def _match_pattern( _one_to_one = [ + torch.ops.aten.abs.default, torch.ops.aten.exp.default, + torch.ops.aten.floor.default, torch.ops.aten.log.default, torch.ops.aten.reciprocal.default, torch.ops.aten.rsqrt.default, @@ -181,6 +183,7 @@ def _match_pattern( torch.ops.aten.hardtanh.default, torch.ops.aten.hardtanh_.default, torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, torch.ops.aten.mean.default, torch.ops.aten.mean.dim, torch.ops.aten.permute.default, diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh new file mode 100755 index 0000000000..f868d264f4 --- /dev/null +++ b/backends/arm/scripts/build_executorch.sh @@ -0,0 +1,123 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Optional parameter: +# --build_type= "Release" | "Debug" | "RelWithDebInfo" +# --etdump build with devtools-etdump support + +set -eu + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../../.. 
&& pwd) +et_root_dir=$(realpath ${et_root_dir}) +toolchain_cmake=${script_dir}/../../../examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake +toolchain_cmake=$(realpath ${toolchain_cmake}) + + + +et_build_root="${et_root_dir}/arm_test" +build_type="Release" +build_with_etdump=false + + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --et_build_root=*) et_build_root="${arg#*=}";; + --build_type=*) build_type="${arg#*=}";; + --etdump) build_with_etdump=true ;; + *) + ;; + esac +done + +et_build_dir="${et_build_root}/cmake-out" +et_build_host_dir=${et_build_root}/cmake-out-host-tools + +set -x +cd "${et_root_dir}" + +build_with_etdump_flags="" +if [ "$build_with_etdump" = true ] ; then + ( set +x ; + echo "--------------------------------------------------------------------------------" ; + echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_build_host_dir} - ${et_build_host_dir}/bin/flatcc" ; + echo "--------------------------------------------------------------------------------" ) + + + # Build host flatcc bin + # This is a way to work around that the flatcc executable get build for target (e.g. Arm) later + # and get replaced. flatcc is a tool used on the host for etdump and BundleIO handling. + # The way to solve this is to generate it once for the host, then copy it to ${et_build_host_dir}/bin + # and later point that out with -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc later. 
+ mkdir -p ${et_build_host_dir}
+ cmake \
+ -DCMAKE_INSTALL_PREFIX=${et_build_host_dir} \
+ -DCMAKE_BUILD_TYPE=${build_type} \
+ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
+ -DEXECUTORCH_ENABLE_LOGGING=ON \
+ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+ -DEXECUTORCH_BUILD_DEVTOOLS=ON \
+ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=ON \
+ -DFLATCC_ALLOW_WERROR=OFF \
+ -DFLATC_EXECUTABLE="$(which flatc)" \
+ -B"${et_build_host_dir}" \
+ "${et_root_dir}"
+
+ # Copy the host flatcc executable so it is preserved when we build for the target (Arm) later
+ mkdir -p ${et_build_host_dir}/bin
+ cp third-party/flatcc/bin/flatcc ${et_build_host_dir}/bin
+
+ # Add the DevTools flags used in the target build below
+ build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \
+ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF \
+ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \
+ -DFLATCC_ALLOW_WERROR=OFF \
+ -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc "
+ echo "build_with_etdump_flags=$build_with_etdump_flags"
+fi
+
+( set +x ;
+ echo "--------------------------------------------------------------------------------" ;
+ echo "Build ExecuTorch target libs ${build_type} into '${et_build_dir}'" ;
+ echo "--------------------------------------------------------------------------------" )
+
+# Build
+cmake \
+ -DCMAKE_INSTALL_PREFIX=${et_build_dir} \
+ -DCMAKE_BUILD_TYPE=${build_type} \
+ -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \
+ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
+ -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+ -DEXECUTORCH_ENABLE_LOGGING=ON \
+ ${build_with_etdump_flags} \
+ -DFLATC_EXECUTABLE="$(which flatc)" \
+ -B"${et_build_dir}" \
+ "${et_root_dir}"
+
+echo "[$(basename $0)] Configured CMAKE"
+
+cmake --build ${et_build_dir} --parallel --target install --config ${build_type} --
+
+set +x
+
+echo "[$(basename $0)] Generated static libraries for ExecuTorch:"
+find ${et_build_dir} -name "*.a" -exec ls -al {} \;
diff --git a/backends/arm/scripts/build_executorch_runner.sh b/backends/arm/scripts/build_executorch_runner.sh
new file mode 100755
index 0000000000..afa8f27bdf
--- /dev/null
+++ b/backends/arm/scripts/build_executorch_runner.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -eu
+
+script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+et_root_dir=$(cd ${script_dir}/../../.. && pwd)
+et_root_dir=$(realpath ${et_root_dir})
+toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
+
+pte_file=""
+target="ethos-u55-128"
+build_type="Release"
+system_config=""
+build_with_etdump=false
+extra_build_flags=""
+output_folder_set=false
+output_folder="."
+et_build_root="${et_root_dir}/arm_test" +ethosu_tools_dir=${et_root_dir}/examples/arm/ethos-u-scratch + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --pte= pte file (genrated by the aot_arm_compier from the model to include in the elf" + echo " --target= Target to build and run for Default: ${target}" + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + echo " --system_config= System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets." + echo " NOTE: If given, this option must match the given target. This option also sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt." + echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + echo " --extra_build_flags= Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " + echo " --output= Output folder Default: /_.pte" + echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" + echo " --ethosu_tools_dir= Path to your Ethos-U tools dir if you not using default: ${ethosu_tools_dir}" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --pte=*) pte_file="${arg#*=}";; + --target=*) target="${arg#*=}";; + --build_type=*) build_type="${arg#*=}";; + --system_config=*) system_config="${arg#*=}";; + --etdump) build_with_etdump=true ;; + --extra_build_flags=*) extra_build_flags="${arg#*=}";; + --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;; + --et_build_root=*) et_build_root="${arg#*=}";; + --ethosu_tools_dir=*) ethosu_tools_dir="${arg#*=}";; + *) + ;; + esac +done + +pte_file=$(realpath ${pte_file}) +ethosu_tools_dir=$(realpath ${ethosu_tools_dir}) +ethos_u_root_dir="$ethosu_tools_dir/ethos-u" +ethosu_tools_dir=$(realpath ${ethos_u_root_dir}) + +et_build_dir=${et_build_root}/cmake-out +et_build_dir=$(realpath ${et_build_dir}) + +if [ "$output_folder_set" = false ] ; then + pte_folder=$(cd -- "$( dirname -- "${pte_file}" )" &> /dev/null && pwd) + pte_short_name=$(basename -- "${pte_file}" ".pte") + output_folder="$pte_folder/$pte_short_name" +fi + +if [[ ${system_config} == "" ]] +then + system_config="Ethos_U55_High_End_Embedded" + if [[ ${target} =~ "ethos-u85" ]] + then + system_config="Ethos_U85_SYS_DRAM_Mid" + fi +fi + +output_folder=$(realpath ${output_folder}) + +if [[ ${target} == *"ethos-u55"* ]]; then + target_cpu=cortex-m55 +else + target_cpu=cortex-m85 +fi +echo "--------------------------------------------------------------------------------" +echo "Build Arm Baremetal executor_runner for ${target} with ${pte_file} using ${system_config} to '${output_folder}/cmake-out'" +echo "--------------------------------------------------------------------------------" + +cd ${et_root_dir}/examples/arm/executor_runner + +build_with_etdump_flags="" +if [ "$build_with_etdump" = true ] ; then + echo "Building with etdump e.g. 
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON" + build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " +fi + +mkdir -p "$output_folder" + +cmake \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ + -DTARGET_CPU=${target_cpu} \ + -DET_DIR_PATH:PATH=${et_root_dir} \ + -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ + -DET_PTE_FILE_PATH:PATH="${pte_file}" \ + -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ + -DETHOSU_TARGET_NPU_CONFIG=${target} \ + ${build_with_etdump_flags} \ + -DPYTHON_EXECUTABLE=$(which python3) \ + -DSYSTEM_CONFIG=${system_config} \ + ${extra_build_flags} \ + -B ${output_folder}/cmake-out + +echo "[${BASH_SOURCE[0]}] Configured CMAKE" + +cmake --build ${output_folder}/cmake-out --parallel -- arm_executor_runner + +echo "[${BASH_SOURCE[0]}] Generated baremetal elf file:" +find ${output_folder}/cmake-out -name "arm_executor_runner" +echo "executable_text: $(find ${output_folder}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $1}') bytes" +echo "executable_data: $(find ${output_folder}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $2}') bytes" +echo "executable_bss: $(find ${output_folder}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $3}') bytes" diff --git a/backends/arm/scripts/build_portable_kernels.sh b/backends/arm/scripts/build_portable_kernels.sh new file mode 100755 index 0000000000..afdccd79cf --- /dev/null +++ b/backends/arm/scripts/build_portable_kernels.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Optional parameter: +# --build_type= "Release" | "Debug" | "RelWithDebInfo" +# --etdump build with devtools-etdump support + +set -eu + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../../.. && pwd) +et_root_dir=$(realpath ${et_root_dir}) +toolchain_cmake=${script_dir}/../../../examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake +toolchain_cmake=$(realpath ${toolchain_cmake}) + + +et_build_root="${et_root_dir}/arm_test" +build_type="Release" +portable_kernels="aten::_softmax.out" + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + echo " --portable_kernels= Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --et_build_root=*) et_build_root="${arg#*=}";; + --build_type=*) build_type="${arg#*=}";; + --portable_kernels=*) portable_kernels="${arg#*=}";; + *) + ;; + esac +done + +et_build_dir=${et_build_root}/cmake-out + +cd "${et_root_dir}" + +echo "--------------------------------------------------------------------------------" ; +echo "Build ExecuTorch Libraries ${build_type} portable kernels: ${portable_kernels} into '${et_build_dir}'" ; +echo "--------------------------------------------------------------------------------" + +if ! 
[[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]]; then + echo " ERROR: specified argument --portable_kernels=${portable_kernels}" + echo " is in the wrong format please use \"aten::.out,aten::.out,...\"" + echo " e.g. \"aten::_softmax.out,aten::add.out\"" + exit 1 +fi + +set -x + +cmake \ + -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ + -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels} \ + -B"${et_build_dir}/examples/arm" \ + "${et_root_dir}/examples/arm" + +cmake --build "${et_build_dir}/examples/arm" --parallel --config ${build_type} -- + +set +x + +echo "[$(basename $0)] Generated static libraries for ExecuTorch:" +find "${et_build_dir}/examples/arm" -name "*.a" -exec ls -al {} \; diff --git a/backends/arm/scripts/build_quantized_ops_aot_lib.sh b/backends/arm/scripts/build_quantized_ops_aot_lib.sh index 3c70b48a5d..ad6fad9c12 100755 --- a/backends/arm/scripts/build_quantized_ops_aot_lib.sh +++ b/backends/arm/scripts/build_quantized_ops_aot_lib.sh @@ -4,26 +4,51 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Needs to be run from exeuctorch root. # Optional parameter: 1: build_type= "Release" | "Debug" | "RelWithDebInfo" +set -eu +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../../.. && pwd) +et_root_dir=$(realpath ${et_root_dir}) + build_type="Release" +et_build_root="${et_root_dir}" + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --et_build_root=*) et_build_root="${arg#*=}";; + --build_type=*) build_type="${arg#*=}";; + *) + ;; + esac +done + +et_build_dir=${et_build_root}/cmake-out-aot-lib -build_type=${1:-$build_type} +cd "${et_root_dir}" echo "--------------------------------------------------------------------------------" -echo "Build .so library to register quant ops with AoT flow ${build_type} into '$(echo $(pwd))/cmake-out-aot-lib'" +echo "Build quantized_ops_aot_lib library to register quant ops with AoT flow ${build_type} into '${et_build_dir}'" echo "--------------------------------------------------------------------------------" # Since we only want to build the quantized_aot lib in the specified folder, # we want exactly the configuration set below and deleting the cache is OK. -rm -f cmake-out-aot-lib/CMakeCache.txt +rm -f ${et_build_dir}/CMakeCache.txt CXXFLAGS="-fno-exceptions -fno-rtti" cmake \ -DCMAKE_BUILD_TYPE=${build_type} \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \ - -Bcmake-out-aot-lib \ + -B${et_build_dir} \ . -cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib +cmake --build ${et_build_dir} --parallel -- quantized_ops_aot_lib diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh new file mode 100755 index 0000000000..568f07011f --- /dev/null +++ b/backends/arm/scripts/run_fvp.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
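For completeness, the reworked quantized-ops script above follows the same argument style as the other helpers; a hedged invocation sketch (the build root here is chosen only for illustration) could be:

    # Builds quantized_ops_aot_lib, the shared library that registers the
    # quantized ops with the ahead-of-time flow; with this build root the
    # output lands in arm_test/cmake-out-aot-lib.
    backends/arm/scripts/build_quantized_ops_aot_lib.sh --et_build_root=arm_test --build_type=Release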
+ +# Optional parameter: +# --build_type= "Release" | "Debug" | "RelWithDebInfo" +# --etdump build with devtools-etdump support + +set -eu + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../../.. && pwd) +et_root_dir=$(realpath ${et_root_dir}) +setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh +_setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools." + + +elf_file="" +target="ethos-u55-128" + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --elf= elf file to run" + echo " --target= Target to build and run for Default: ${target}" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --elf=*) elf_file="${arg#*=}";; + --target=*) target="${arg#*=}";; + *) + ;; + esac +done + +elf_file=$(realpath ${elf_file}) + +if [[ ${target} == *"ethos-u55"* ]]; then + fvp_model=FVP_Corstone_SSE-300_Ethos-U55 +else + fvp_model=FVP_Corstone_SSE-320 +fi + +# Source the tools +# This should be prepared by the setup.sh +[[ -f ${setup_path_script} ]] \ + || { echo "Missing ${setup_path_script}. ${_setup_msg}"; exit 1; } + +source ${setup_path_script} + +# basic checks before we get started +hash ${fvp_model} \ + || { echo "Could not find ${fvp_model} on PATH, ${_setup_msg}"; exit 1; } + + +[[ ! -f $elf_file ]] && { echo "[${BASH_SOURCE[0]}]: Unable to find executor_runner elf: ${elf_file}"; exit 1; } +num_macs=$(echo ${target} | cut -d - -f 3) + +echo "--------------------------------------------------------------------------------" +echo "Running ${elf_file} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" +echo "--------------------------------------------------------------------------------" + +log_file=$(mktemp) + +if [[ ${target} == *"ethos-u55"* ]]; then + ${fvp_model} \ + -C ethosu.num_macs=${num_macs} \ + -C mps3_board.visualisation.disable-visualisation=1 \ + -C mps3_board.telnetterminal0.start_telnet=0 \ + -C mps3_board.uart0.out_file='-' \ + -C mps3_board.uart0.shutdown_on_eot=1 \ + -a "${elf_file}" \ + --timelimit 220 2>&1 | tee ${log_file} || true # seconds + echo "[${BASH_SOURCE[0]}] Simulation complete, $?" +elif [[ ${target} == *"ethos-u85"* ]]; then + ${fvp_model} \ + -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ + -C mps4_board.visualisation.disable-visualisation=1 \ + -C vis_hdlcd.disable_visualisation=1 \ + -C mps4_board.telnetterminal0.start_telnet=0 \ + -C mps4_board.uart0.out_file='-' \ + -C mps4_board.uart0.shutdown_on_eot=1 \ + -a "${elf_file}" \ + --timelimit 220 2>&1 | tee ${log_file} || true # seconds + echo "[${BASH_SOURCE[0]}] Simulation complete, $?" +else + echo "Running ${elf_file} for ${target} is not supported" + exit 1 +fi + +echo "Checking for problems in log:" +! grep -E "^(F|E|\\[critical\\]|Hard fault.|Info: Simulation is stopping. Reason: CPU time has been exceeded.).*$" ${log_file} +if [ $? != 0 ]; then + echo "Found ERROR" + rm "${log_file}" + exit 1 +fi +echo "No problems found!" 
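Taken together, the new helper scripts are meant to be chained; a minimal sketch of that flow is shown below. The model file name is a placeholder, and the .pte is assumed to come from the AoT compiler referenced in the runner's help text:

    # 1) Cross-compile the ExecuTorch libraries and the default portable kernels.
    backends/arm/scripts/build_executorch.sh --build_type=Release
    backends/arm/scripts/build_portable_kernels.sh --portable_kernels="aten::_softmax.out"

    # 2) Build the baremetal executor_runner around an exported model.
    backends/arm/scripts/build_executorch_runner.sh --pte=my_model.pte --target=ethos-u85-128

    # 3) Run the resulting elf on the Corstone FVP; the runner binary is located
    #    the same way the build script itself reports it.
    backends/arm/scripts/run_fvp.sh \
        --elf=$(find my_model/cmake-out -name arm_executor_runner) \
        --target=ethos-u85-128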
+rm "${log_file}" diff --git a/backends/arm/test/misc/test_multiple_outputs.py b/backends/arm/test/misc/test_multiple_outputs.py index ddddc94d27..d3bea9a400 100644 --- a/backends/arm/test/misc/test_multiple_outputs.py +++ b/backends/arm/test/misc/test_multiple_outputs.py @@ -76,23 +76,21 @@ def _test_ethosu_BI_pipeline( tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @pytest.mark.corstone_fvp - def test_u85_BI(self): + def test_u55_BI(self): module = self.MultipleOutputsModule() test_data = module.get_inputs() self._test_ethosu_BI_pipeline( module, test_data, - common.get_u85_compile_spec(), + common.get_u55_compile_spec(), ) @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - # TODO MLETORCH-598 - def test_u55_BI(self): + def test_u85_BI(self): module = self.MultipleOutputsModule() test_data = module.get_inputs() self._test_ethosu_BI_pipeline( module, test_data, - common.get_u55_compile_spec(), + common.get_u85_compile_spec(), ) diff --git a/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py b/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py new file mode 100644 index 0000000000..4bcae4930a --- /dev/null +++ b/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py @@ -0,0 +1,65 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Test that tosa_supported_operators reject operators that are not +# quantized properly. This is typically a consequence of a torch op +# such a Softplus that is decompsed into many other ops without +# surrounding q/dq nodes. + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] +aten_op: list[str] = ["torch.ops.aten.add.Tensor", "torch.ops.aten.softplus.default"] +exir_op: list[str] = [ + "executorch_exir_dialects_edge__ops_aten_add_Tensor", + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_exp_default", + "executorch_exir_dialects_edge__ops_aten_div_Tensor", +] + + +test_data: dict[input_t1] = { + "3d_rand": (torch.rand(1, 5, 5),), +} + + +class Module(torch.nn.Module): + def __init__(self): + super().__init__() + self.softplus = torch.nn.Softplus() + + def forward(self, x: torch.Tensor): + return self.softplus(x + x) + + +@common.parametrize("test_data", test_data) +def test_softplus_tosa_MI(test_data: input_t1): + pipeline = TosaPipelineMI[input_t1]( + Module(), test_data=test_data, aten_op=aten_op, exir_op=exir_op + ) + # remove check_count.exir as there will be more than one delegate + pipeline.pop_stage("check_count.exir") + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_softplus_tosa_BI(test_data: input_t1): + pipeline = TosaPipelineBI[input_t1]( + Module(), test_data=test_data, aten_op=aten_op, exir_op=exir_op + ) + pipeline.pop_stage("check_not.exir") + # check that all ops in exir_op except add are rejected + pipeline.add_stage_after( + "partition", pipeline.tester.check, exir_op[1:], suffix="exir_post_partition" + ) + pipeline.run() diff --git a/backends/arm/test/models/test_w2l_arm.py b/backends/arm/test/models/test_w2l_arm.py new file mode 100644 index 0000000000..184216e0ef --- /dev/null +++ b/backends/arm/test/models/test_w2l_arm.py @@ -0,0 +1,150 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import unittest +from typing import Tuple + +import pytest + +import torch +from executorch.backends.arm.test import common, conftest +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.exir.backend.compile_spec_schema import CompileSpec +from torchaudio import models + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def get_test_inputs(batch_size, num_features, input_frames): + return (torch.randn(batch_size, num_features, input_frames),) + + +class TestW2L(unittest.TestCase): + """Tests Wav2Letter.""" + + batch_size = 10 + input_frames = 400 + num_features = 1 + + w2l = models.Wav2Letter(num_features=num_features).eval() + model_example_inputs = get_test_inputs(batch_size, num_features, input_frames) + + all_operators = { + "executorch_exir_dialects_edge__ops_aten_convolution_default", + "executorch_exir_dialects_edge__ops_aten__log_softmax_default", + "executorch_exir_dialects_edge__ops_aten_relu_default", + } + + operators_after_quantization = all_operators - { + "executorch_exir_dialects_edge__ops_aten__log_softmax_default", + } + + @pytest.mark.slow # about 3min on std laptop + def test_w2l_tosa_MI(self): + ( + ArmTester( + self.w2l, + example_inputs=self.model_example_inputs, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), + ) + .export() + .dump_operator_distribution() + .to_edge_transform_and_lower() + .dump_operator_distribution() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs( + inputs=get_test_inputs( + self.batch_size, self.num_features, self.input_frames + ) + ) + ) + + @pytest.mark.slow # about 1min on std laptop + def test_w2l_tosa_BI(self): + ( + ArmTester( + self.w2l, + example_inputs=self.model_example_inputs, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), + ) + .quantize() + .export() + .dump_operator_distribution() + .to_edge_transform_and_lower() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs( + atol=0.1, + qtol=1, + inputs=get_test_inputs( + self.batch_size, self.num_features, self.input_frames + ), + ) + ) + + def _test_w2l_ethos_BI_pipeline( + self, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], + compile_spec: CompileSpec, + ): + tester = ( + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) + .quantize() + .export() + .to_edge() + .check(list(self.operators_after_quantization)) + .partition() + .to_executorch() + .serialize() + ) + return tester + + # TODO: expected fail as TOSA.Transpose is not supported by Ethos-U55 + @pytest.mark.slow + @pytest.mark.corstone_fvp + @conftest.expectedFailureOnFVP + def test_w2l_u55_BI(self): + tester = self._test_w2l_ethos_BI_pipeline( + self.w2l, + self.model_example_inputs, + common.get_u55_compile_spec(), + ) + + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs( + atol=1.0, + qtol=1, + inputs=get_test_inputs( + self.batch_size, self.num_features, self.input_frames + ), + ) + + @pytest.mark.slow + @pytest.mark.corstone_fvp + @unittest.skip("Blocked by MLBEDSW-10420") + @conftest.expectedFailureOnFVP # TODO: MLBEDSW-10093 + def test_w2l_u85_BI(self): + tester = 
self._test_w2l_ethos_BI_pipeline( + self.w2l, + self.model_example_inputs, + common.get_u85_compile_spec(), + ) + + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs( + atol=1.0, + qtol=1, + inputs=get_test_inputs( + self.batch_size, self.num_features, self.input_frames + ), + ) diff --git a/backends/arm/test/ops/test_abs.py b/backends/arm/test/ops/test_abs.py new file mode 100644 index 0000000000..481c7d5ed0 --- /dev/null +++ b/backends/arm/test/ops/test_abs.py @@ -0,0 +1,125 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2025 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import pytest + +import torch +from executorch.backends.arm.test import common, conftest +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + + +class TestAbs(unittest.TestCase): + class Abs(torch.nn.Module): + test_parameters = [ + (torch.zeros(5),), + (torch.full((5,), -1, dtype=torch.float32),), + (torch.ones(5) * -1,), + (torch.randn(8),), + (torch.randn(2, 3, 4),), + (torch.randn(1, 2, 3, 4),), + (torch.normal(mean=0, std=10, size=(2, 3, 4)),), + ] + + def forward(self, x): + return torch.abs(x) + + def _test_abs_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), + ) + .export() + .check_count({"torch.ops.aten.abs.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["torch.ops.aten.abs.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_abs_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.abs.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_abs_ethosu_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], + ): + tester = ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.abs.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .serialize() + ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @parameterized.expand(Abs.test_parameters) + def test_abs_tosa_MI(self, test_data: torch.Tensor): + test_data = (test_data,) + self._test_abs_tosa_MI_pipeline(self.Abs(), test_data) + + @parameterized.expand(Abs.test_parameters) + def test_abs_tosa_BI(self, test_data: torch.Tensor): + test_data = (test_data,) + self._test_abs_tosa_BI_pipeline(self.Abs(), test_data) + + 
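The Corstone-marked cases in this file only verify outputs on the FVP when the corresponding pytest option is enabled; a hedged example of running just these tests, mirroring the flags used elsewhere in this patch, might be:

    # Requires the Corstone FVPs on PATH (see examples/arm/setup.sh);
    # --arm_run_corstoneFVP enables the run_method_and_compare_outputs step.
    pytest --verbose --color=yes backends/arm/test/ops/test_abs.py --arm_run_corstoneFVP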
@parameterized.expand(Abs.test_parameters) + @pytest.mark.corstone_fvp + def test_abs_u55_BI(self, test_data: torch.Tensor): + test_data = (test_data,) + self._test_abs_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Abs(), test_data + ) + + @parameterized.expand(Abs.test_parameters) + @pytest.mark.corstone_fvp + def test_abs_u85_BI(self, test_data: torch.Tensor): + test_data = (test_data,) + self._test_abs_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Abs(), test_data + ) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 46b6eb6d01..d7214f7622 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -150,9 +150,10 @@ def test_bmm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple] test_data = test_data_generator() self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data) + # Expected to fail on FVP as TOSA.MATMUL is not supported on U55 @parameterized.expand(BMM.test_data_generators) @pytest.mark.corstone_fvp - @unittest.expectedFailure + @conftest.expectedFailureOnFVP def test_bmm_u55_BI_xfails(self, test_data_generator: Callable[[], Tuple]): test_data = test_data_generator() self._test_bmm_ethosu_BI_pipeline( @@ -167,10 +168,10 @@ def test_bmm_u85_BI(self, test_data_generator: Callable[[], Tuple]): self.BMM(), common.get_u85_compile_spec(), test_data ) - # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy + # Expected to fail on FVP as TOSA.MATMUL is not supported on U55 @parameterized.expand(BMMSingleInput.test_data_generators) @pytest.mark.corstone_fvp - @unittest.expectedFailure + @conftest.expectedFailureOnFVP def test_bmm_single_input_u55_BI_xfails( self, test_data_generator: Callable[[], Tuple] ): diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index a1613d1d04..63423b9e99 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -111,7 +111,6 @@ def _test_cat_ethosu_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() - .dump_artifact() .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() diff --git a/backends/arm/test/ops/test_floor.py b/backends/arm/test/ops/test_floor.py new file mode 100644 index 0000000000..c19dc8605b --- /dev/null +++ b/backends/arm/test/ops/test_floor.py @@ -0,0 +1,82 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + + +aten_op = "torch.ops.aten.floor.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_floor_default" + +input_t1 = Tuple[torch.Tensor] # Input x + + +class Floor(torch.nn.Module): + def forward(self, x: torch.Tensor): + return torch.floor(x) + + test_data: dict[str, input_t1] = { + "zeros": (torch.zeros(1, 10, 10, 10),), + "ones": (torch.ones(10, 10, 10),), + "rand": ((torch.rand(10, 10) - 0.5),), + "randn_pos": ((torch.randn(1, 4, 4, 4) + 10),), + "randn_neg": ((torch.randn(1, 4, 4, 4) - 10),), + "ramp": (torch.arange(-16, 16, 0.2),), + } + + +@common.parametrize("test_data", Floor.test_data) +def test_floor_tosa_MI(test_data: input_t1): + pipeline = TosaPipelineMI[input_t1](Floor(), test_data, aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", Floor.test_data) +def test_floor_tosa_BI(test_data: input_t1): + pipeline = TosaPipelineBI[input_t1](Floor(), test_data, aten_op, exir_op) + pipeline.run() + + +@common.parametrize("test_data", Floor.test_data) +def test_floor_u55_BI(test_data: input_t1): + pipeline = EthosU55PipelineBI[input_t1]( + Floor(), test_data, aten_op, exir_op, run_on_fvp=False + ) + pipeline.run() + + +@common.parametrize("test_data", Floor.test_data) +def test_floor_u85_BI(test_data: input_t1): + pipeline = EthosU85PipelineBI[input_t1]( + Floor(), test_data, aten_op, exir_op, run_on_fvp=False + ) + pipeline.run() + + +@common.parametrize("test_data", Floor.test_data) +@common.SkipIfNoCorstone300 +def test_floor_u55_BI_on_fvp(test_data: input_t1): + pipeline = EthosU55PipelineBI[input_t1]( + Floor(), test_data, aten_op, exir_op, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", Floor.test_data) +@common.SkipIfNoCorstone320 +def test_floor_u85_BI_on_fvp(test_data: input_t1): + pipeline = EthosU85PipelineBI[input_t1]( + Floor(), test_data, aten_op, exir_op, run_on_fvp=True + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index 82f0af8dcf..a2a42189cd 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -158,7 +158,7 @@ def test_layer_norm_tosa_BI( self.LayerNorm(*model_params), (test_data,) ) - @parameterized.expand(test_data_suite[4:]) + @parameterized.expand(test_data_suite) @pytest.mark.corstone_fvp def test_layer_norm_u55_BI( self, @@ -170,36 +170,7 @@ def test_layer_norm_u55_BI( self.LayerNorm(*model_params), common.get_u55_compile_spec(), (test_data,) ) - # Numerical issues on FVP likely due to mul op, MLETORCH-521 - # Skip tests that require transposes. 
- @parameterized.expand(test_data_suite[:4]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_layer_norm_u55_BI_xfails( - self, - test_name: str, - test_data: torch.Tensor, - model_params, - ): - self._test_layernorm_ethosu_BI_pipeline( - self.LayerNorm(*model_params), common.get_u55_compile_spec(), (test_data,) - ) - - # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[:-2]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_layer_norm_u85_BI_xfails( - self, - test_name: str, - test_data: torch.Tensor, - model_params, - ): - self._test_layernorm_ethosu_BI_pipeline( - self.LayerNorm(*model_params), common.get_u85_compile_spec(), (test_data,) - ) - - @parameterized.expand(test_data_suite[-2:]) + @parameterized.expand(test_data_suite) @pytest.mark.corstone_fvp def test_layer_norm_u85_BI( self, diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index f34d4afbb5..bd48bd224a 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -11,7 +11,7 @@ import pytest import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -28,16 +28,17 @@ lambda: ("randn", torch.randn(10, 10, 10, 10), 3), lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), ] -test_data_generators_u55 = [ + +test_data_generators_FVP = [ # (test_name, test_data, dim) lambda: ("ones", torch.ones(10, 10), 1), lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1), - lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), - lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0), - lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), + lambda: ("randn_neg_dim", torch.randn(1, 5, 8, 7), -3), + lambda: ("zeros", torch.zeros(1, 8, 5, 2), 0), + lambda: ("zeros_neg_dim", torch.zeros(1, 7, 8, 9), -4), lambda: ("rand", torch.rand(1, 2, 5, 8), 2), - lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), - lambda: ("randn", torch.randn(10, 10, 10, 10), 3), + lambda: ("rand_neg_dim", torch.rand(1, 10, 8, 10), -2), + lambda: ("randn", torch.randn(1, 10, 10, 10), 3), ] @@ -99,7 +100,7 @@ def _test_logsoftmax_tosa_ethos_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -114,21 +115,10 @@ def _test_logsoftmax_tosa_ethos_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten__logsoftmax_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) - - def _test_logsoftmax_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - self._test_logsoftmax_tosa_ethos_BI_pipeline( - common.get_u55_compile_spec(), module, test_data - ) - - def _test_logsoftmax_tosa_u85_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - self._test_logsoftmax_tosa_ethos_BI_pipeline( - common.get_u85_compile_spec(), module, test_data - ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(test_data_generators) def test_logsoftmax_tosa_MI(self, test_data_generator: Callable[[], Tuple]): @@ -141,18 +131,18 @@ def test_logsoftmax_tosa_BI(self, test_data_generator: 
Callable[[], Tuple]): test_name, test_data, dim = test_data_generator() self._test_logsoftmax_tosa_BI_pipeline(self.LogSoftmax(dim=dim), (test_data,)) - @parameterized.expand(test_data_generators_u55) + @parameterized.expand(test_data_generators_FVP) @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation def test_logsoftmax_tosa_u55_BI(self, test_data_generator: Callable[[], Tuple]): test_name, test_data, dim = test_data_generator() - self._test_logsoftmax_tosa_u55_BI_pipeline( - self.LogSoftmax(dim=dim), (test_data,) + self._test_logsoftmax_tosa_ethos_BI_pipeline( + common.get_u55_compile_spec(), self.LogSoftmax(dim=dim), (test_data,) ) - @parameterized.expand(test_data_generators) + @parameterized.expand(test_data_generators_FVP) @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation def test_logsoftmax_tosa_u85_BI(self, test_data_generator: Callable[[], Tuple]): test_name, test_data, dim = test_data_generator() - self._test_logsoftmax_tosa_u85_BI_pipeline( - self.LogSoftmax(dim=dim), (test_data,) + self._test_logsoftmax_tosa_ethos_BI_pipeline( + common.get_u85_compile_spec(), self.LogSoftmax(dim=dim), (test_data,) ) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 393cf1667e..78997ac047 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -10,7 +10,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -121,7 +121,7 @@ def _test_adaptive_avg_pool2d_tosa_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -141,7 +141,10 @@ def _test_adaptive_avg_pool2d_tosa_ethosu_BI_pipeline( ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(inputs=test_data) def _test_meandim_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] @@ -188,7 +191,7 @@ def _test_meandim_tosa_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -207,7 +210,10 @@ def _test_meandim_tosa_ethosu_BI_pipeline( ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(AdaptiveAveragePool2d.test_data_suite) def test_adaptive_avg_pool2d_tosa_MI( diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index df75e4ed18..347a0b297f 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -132,18 +132,16 @@ def test_mm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple]) test_data = test_data_generator() self._test_mm_tosa_BI_pipeline(self.MMSingleInput(), test_data) - # Expected to fail with error: CPU performance estimation for "MatMul" not implemented + # TODO: Enable numerical testing @parameterized.expand(MM.test_data_generators) - @unittest.expectedFailure def test_mm_u55_BI(self, 
test_data_generator: Callable[[], Tuple]): test_data = test_data_generator() self._test_mm_ethosu_BI_pipeline( common.get_u55_compile_spec(), self.MM(), test_data ) - # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy + # TODO: Enable numerical testing @parameterized.expand(MMSingleInput.test_data_generators) - @unittest.expectedFailure def test_mm_single_input_u55_BI(self, test_data_generator: Callable[[], Tuple]): test_data = test_data_generator() self._test_mm_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py index b474da573f..fbeb4ebf9e 100644 --- a/backends/arm/test/ops/test_select.py +++ b/backends/arm/test/ops/test_select.py @@ -19,7 +19,7 @@ test_data_suite: list[tuple[test_data_t]] = [ # (test_data, dim, index) ((torch.zeros(5, 3, 20), -1, 0),), - ((torch.zeros(5, 3, 20), 0, -1),), + ((torch.rand(5, 3, 20), 0, -1),), ((torch.zeros(5, 3, 20), 0, 4),), ((torch.ones(10, 10, 10), 0, 2),), ((torch.rand(5, 3, 20, 2), 0, 2),), @@ -61,9 +61,7 @@ def _test_select_tosa_MI_pipeline( .check([export_target]) .check_not(["torch.ops.quantized_decomposed"]) .to_edge() - .dump_artifact() .partition() - .dump_artifact() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .run_method_and_compare_outputs(inputs=test_data) diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index c60da18594..787e1b73a3 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -12,7 +12,7 @@ import pytest import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -30,16 +30,16 @@ lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), ] -test_data_generators_u55 = [ +test_data_generators_FVP = [ # (test_name, test_data, dim) lambda: ("ones", torch.ones(10, 10), 1), - lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1), - lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), - lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0), - lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), + lambda: ("ones_neg_dim", torch.ones(1, 3, 4), -1), + lambda: ("randn_neg_dim", torch.randn(1, 5, 8, 7), -3), + lambda: ("zeros", torch.zeros(1, 8, 5, 2), 0), + lambda: ("zeros_neg_dim", torch.zeros(1, 7, 8, 9), -4), lambda: ("rand", torch.rand(1, 2, 5, 8), 2), - lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), - lambda: ("randn", torch.randn(10, 10, 10, 10), 3), + lambda: ("rand_neg_dim", torch.rand(1, 10, 8, 10), -2), + lambda: ("randn", torch.randn(1, 10, 10, 10), 3), ] @@ -95,13 +95,13 @@ def _test_softmax_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_softmax_tosa_ethos_BI_pipeline( + def _test_softmax_ethosu_BI_pipeline( self, compile_spec: list[CompileSpec], module: torch.nn.Module, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -116,21 +116,10 @@ def _test_softmax_tosa_ethos_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten__softmax_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) - - def _test_softmax_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): 
- self._test_softmax_tosa_ethos_BI_pipeline( - common.get_u55_compile_spec(), module, test_data - ) - - def _test_softmax_tosa_u85_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - self._test_softmax_tosa_ethos_BI_pipeline( - common.get_u85_compile_spec(), module, test_data - ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(test_data_generators) def test_softmax_tosa_MI(self, test_data_generator: Callable[[], Tuple]): @@ -143,14 +132,18 @@ def test_softmax_tosa_BI(self, test_data_generator: Callable[[], Tuple]): test_name, test_data, dim = test_data_generator() self._test_softmax_tosa_BI_pipeline(self.Softmax(dim=dim), (test_data,)) - @parameterized.expand(test_data_generators_u55) + @parameterized.expand(test_data_generators_FVP) @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation - def test_softmax_tosa_u55_BI(self, test_data_generator: Callable[[], Tuple]): + def test_softmax_u55_BI(self, test_data_generator: Callable[[], Tuple]): test_name, test_data, dim = test_data_generator() - self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) + self._test_softmax_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Softmax(dim=dim), (test_data,) + ) - @parameterized.expand(test_data_generators) + @parameterized.expand(test_data_generators_FVP) @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation - def test_softmax_tosa_u85_BI(self, test_data_generator: Callable[[], Tuple]): + def test_softmax_u85_BI(self, test_data_generator: Callable[[], Tuple]): test_name, test_data, dim = test_data_generator() - self._test_softmax_tosa_u85_BI_pipeline(self.Softmax(dim=dim), (test_data,)) + self._test_softmax_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Softmax(dim=dim), (test_data,) + ) diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 5627c55ad9..bc0c50b8ee 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -9,7 +9,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -29,7 +29,7 @@ class Sum(torch.nn.Module): ((torch.rand(10), 0, True),), ((torch.rand(10, 10), 1, False),), ((torch.rand(10, 10, 10), [-3, 1], True),), - ((torch.rand(2, 1, 5, 8), 1, False),), + ((torch.rand(1, 1, 5, 8), 1, False),), ((torch.rand(1, 2, 3, 4), 3, True),), ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), ] @@ -39,7 +39,7 @@ class Sum(torch.nn.Module): ((torch.rand(10, 10), 1, False),), ((torch.rand(1, 2, 3, 4), 3, True),), ((torch.rand(10, 10, 10), [-3, 1], True),), - ((torch.rand(2, 1, 5, 8), 1, False),), + ((torch.rand(1, 1, 5, 8), 1, False),), ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), ] @@ -82,7 +82,7 @@ def _test_sum_tosa_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) + .run_method_and_compare_outputs(inputs=test_data) ) def _test_sum_ethosu_BI_pipeline( @@ -91,7 +91,7 @@ def _test_sum_ethosu_BI_pipeline( test_data: tuple[exampledata_t], compile_spec: CompileSpec, ): - ( + tester = ( ArmTester( module, 
example_inputs=test_data, @@ -107,6 +107,8 @@ def _test_sum_ethosu_BI_pipeline( .to_executorch() .serialize() ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(Sum.test_parameters) def test_sum_tosa_MI(self, test_data: tuple[exampledata_t]): diff --git a/backends/arm/test/ops/test_to_copy.py b/backends/arm/test/ops/test_to_copy.py index 6992ac2f8e..db3e93fbdc 100644 --- a/backends/arm/test/ops/test_to_copy.py +++ b/backends/arm/test/ops/test_to_copy.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -55,9 +55,7 @@ def _test_to_copy_tosa_MI_pipeline( compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() - .dump_artifact() .to_edge() - .dump_artifact() .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index ad095f01de..6690c668f9 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -16,7 +16,7 @@ get_symmetric_quantization_config, TOSAQuantizer, ) -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.arm.tosa_specification import TosaSpecification @@ -36,13 +36,16 @@ class Var(torch.nn.Module): (torch.rand(1, 50, 10, 20), False, 0.5), ] + def __init__(self, keepdim: bool = True, correction: int = 0): + super().__init__() + self.keepdim = keepdim + self.correction = correction + def forward( self, x: torch.Tensor, - keepdim: bool = True, - correction: int = 0, ): - return x.var(keepdim=keepdim, correction=correction) + return x.var(keepdim=self.keepdim, correction=self.correction) class VarDim(torch.nn.Module): test_parameters = [ @@ -62,14 +65,17 @@ class VarDim(torch.nn.Module): (torch.rand(1, 50, 10, 20), -1, True, True), ] + def __init__(self, dim: int = -1, keepdim: bool = True, unbiased: bool = False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + self.unbiased = unbiased + def forward( self, x: torch.Tensor, - dim: int = -1, - keepdim: bool = True, - unbiased: bool = False, ): - return x.var(dim=dim, keepdim=keepdim, unbiased=unbiased) + return x.var(dim=self.dim, keepdim=self.keepdim, unbiased=self.unbiased) class VarCorrection(torch.nn.Module): test_parameters = [ @@ -79,14 +85,19 @@ class VarCorrection(torch.nn.Module): (torch.rand(1, 50, 10, 20), (-1, -2), True, 0.5), ] + def __init__( + self, dim: int = -1, keepdim: bool = True, correction: bool = False + ): + super().__init__() + self.dim = dim + self.keepdim = keepdim + self.correction = correction + def forward( self, x: torch.Tensor, - dim: int | tuple[int] = -1, - keepdim: bool = True, - correction: int = 0, ): - return x.var(dim=dim, keepdim=keepdim, correction=correction) + return x.var(dim=self.dim, keepdim=self.keepdim, correction=self.correction) def _test_var_tosa_MI_pipeline( self, @@ -138,7 +149,7 @@ def _test_var_ethosu_BI_pipeline( quantizer = EthosUQuantizer(compile_spec).set_io( get_symmetric_quantization_config() ) - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -150,58 +161,61 @@ def _test_var_ethosu_BI_pipeline( .partition() 
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(Var.test_parameters) def test_var_tosa_MI(self, test_tensor: torch.Tensor, keepdim, correction): - self._test_var_tosa_MI_pipeline(self.Var(), (test_tensor, keepdim, correction)) + self._test_var_tosa_MI_pipeline(self.Var(keepdim, correction), (test_tensor,)) @parameterized.expand(Var.test_parameters) def test_var_tosa_BI(self, test_tensor: torch.Tensor, keepdim, correction): - self._test_var_tosa_BI_pipeline(self.Var(), (test_tensor, keepdim, correction)) + self._test_var_tosa_BI_pipeline(self.Var(keepdim, correction), (test_tensor,)) @parameterized.expand(Var.test_parameters) def test_var_u55_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_ethosu_BI_pipeline( - self.Var(), + self.Var(keepdim, correction), common.get_u55_compile_spec(), - (test_tensor, keepdim, correction), + (test_tensor,), ) @parameterized.expand(Var.test_parameters) def test_var_u85_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_ethosu_BI_pipeline( - self.Var(), + self.Var(keepdim, correction), common.get_u85_compile_spec(), - (test_tensor, keepdim, correction), + (test_tensor,), ) @parameterized.expand(VarDim.test_parameters) - def test_var_dim_tosa_MI(self, test_tensor: torch.Tensor, dim, keepdim, correction): + def test_var_dim_tosa_MI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): self._test_var_tosa_MI_pipeline( - self.VarDim(), (test_tensor, dim, keepdim, correction) + self.VarDim(dim, keepdim, unbiased), (test_tensor,) ) @parameterized.expand(VarDim.test_parameters) - def test_var_dim_tosa_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): + def test_var_dim_tosa_BI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): self._test_var_tosa_BI_pipeline( - self.VarDim(), (test_tensor, dim, keepdim, correction) + self.VarDim(dim, keepdim, unbiased), (test_tensor,) ) @parameterized.expand(VarDim.test_parameters_u55) - def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): + def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): self._test_var_ethosu_BI_pipeline( - self.VarDim(), + self.VarDim(dim, keepdim, unbiased), common.get_u55_compile_spec(), - (test_tensor, dim, keepdim, correction), + (test_tensor,), ) @parameterized.expand(VarDim.test_parameters) - def test_var_dim_u85_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): + def test_var_dim_u85_BI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): self._test_var_ethosu_BI_pipeline( - self.VarDim(), + self.VarDim(dim, keepdim, unbiased), common.get_u85_compile_spec(), - (test_tensor, dim, keepdim, correction), + (test_tensor,), ) @parameterized.expand(VarCorrection.test_parameters) @@ -209,7 +223,7 @@ def test_var_correction_tosa_MI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_tosa_MI_pipeline( - self.VarCorrection(), (test_tensor, dim, keepdim, correction) + self.VarCorrection(dim, keepdim, correction), (test_tensor,) ) @parameterized.expand(VarCorrection.test_parameters) @@ -217,7 +231,7 @@ def test_var_correction_tosa_BI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_tosa_BI_pipeline( - self.VarCorrection(), (test_tensor, dim, keepdim, correction) + self.VarCorrection(dim, keepdim, correction), (test_tensor,) ) 
@parameterized.expand(VarCorrection.test_parameters) @@ -225,9 +239,9 @@ def test_var_correction_u55_BI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_ethosu_BI_pipeline( - self.VarCorrection(), + self.VarCorrection(dim, keepdim, correction), common.get_u55_compile_spec(), - (test_tensor, dim, keepdim, correction), + (test_tensor,), ) @parameterized.expand(VarCorrection.test_parameters) @@ -235,7 +249,7 @@ def test_var_correction_u85_BI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_ethosu_BI_pipeline( - self.VarCorrection(), + self.VarCorrection(dim, keepdim, correction), common.get_u85_compile_spec(), - (test_tensor, dim, keepdim, correction), + (test_tensor,), ) diff --git a/backends/arm/test/passes/test_convert_to_clamp.py b/backends/arm/test/passes/test_convert_to_clamp.py new file mode 100644 index 0000000000..0b106b7bc8 --- /dev/null +++ b/backends/arm/test/passes/test_convert_to_clamp.py @@ -0,0 +1,80 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.arm._passes.convert_to_clamp import ConvertToClampPass + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import RunPasses + + +class HardTanh(torch.nn.Module): + def __init__(self): + super().__init__() + + self.hardtanh = torch.nn.Hardtanh() + + def forward(self, x): + return self.hardtanh(x) + + def get_inputs(self): + return (torch.rand(1, 64, 64, 3),) + + +class ReLU(torch.nn.Module): + def __init__(self): + super().__init__() + + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + def get_inputs(self): + return (torch.rand(1, 64, 64, 3),) + + +class TestConvertToClampPass(unittest.TestCase): + """ + Tests the ConvertToClampPass which converts hardtanh.default and relu.default to clamp.default + """ + + def test_tosa_MI_hardtahn(self): + module = HardTanh() + test_pass_stage = RunPasses([ConvertToClampPass]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), + ) + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + ) + + def test_tosa_MI_relu(self): + module = ReLU() + test_pass_stage = RunPasses([ConvertToClampPass]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), + ) + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + ) diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 65be0b88f7..2d182b4a41 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -525,7 +525,7 @@ def corstone320_installed() -> bool: def get_elf_path(target_board): elf_path = os.path.join( - "cmake-out", + "arm_test", f"arm_semihosting_executor_runner_{target_board}", "arm_executor_runner", ) diff --git 
a/backends/arm/test/setup_testing.sh b/backends/arm/test/setup_testing.sh index ebf9d79967..b9f8fc454e 100755 --- a/backends/arm/test/setup_testing.sh +++ b/backends/arm/test/setup_testing.sh @@ -12,8 +12,8 @@ et_root_dir=$(cd ${script_dir}/../../.. && pwd) ethos_u_root_dir=${et_root_dir}/examples/arm/ethos-u-scratch/ethos-u toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake -et_build_dir=${et_root_dir}/cmake-out -build_root_test_dir=${et_build_dir}/arm_semihosting_executor_runner +et_build_dir=${et_root_dir}/arm_test/cmake-out +build_root_test_dir=${et_root_dir}/arm_test/arm_semihosting_executor_runner # Build Arm Baremetal executor_runner in semihosting mode. # Put in backends/arm/test/res to be used by unit tests. @@ -38,12 +38,12 @@ function build_semihosting_executorch_runner() { -DTARGET_CPU=${target_cpu} \ -DSEMIHOSTING=ON \ -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${build_test_dir} \ - -B ${build_test_dir} \ -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ -DET_DIR_PATH:PATH=${et_root_dir} \ -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ -DPYTHON_EXECUTABLE=$(which python3) \ - -DSYSTEM_CONFIG=${system_config} + -DSYSTEM_CONFIG=${system_config} \ + -B ${build_test_dir} echo "[${FUNCNAME[0]}] Configured CMAKE" n=$(nproc) diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index 9f2fa4c17d..6c2784501b 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -17,46 +17,47 @@ pwd TEST_SUITE=$1 help() { - echo "Usage:" - echo " $0 " - echo " where can be any of:" - # This will list all lines in this file that is starting with test_ remove () { and print it as a list. - # e,g, "test_pytest() { # Test ops and other things" -> test_pytest # Test ops and other things - echo "all # run all tests" - grep "^test_" $0 | sed 's/([^)]*)[[:space:]]*{*//g' - exit + echo "Usage:" + echo " $0 " + echo " where can be any of:" + # This will list all lines in this file that is starting with test_ remove () { and print it as a list. + # e,g, "test_pytest() { # Test ops and other things" -> test_pytest # Test ops and other things + echo "all # run all tests" + grep "^test_" $0 | sed 's/([^)]*)[[:space:]]*{*//g' + exit } if [[ -z "${TEST_SUITE:-}" ]]; then - echo "Missing test suite name, exiting..." - help + echo "Missing test suite name, exiting..." + help else - echo "Run Arm baremetal test suite ${TEST_SUITE}" + echo "Run Arm baremetal test suite ${TEST_SUITE}" fi TEST_SUITE_NAME="$(basename "$0") ${TEST_SUITE}" all() { # Run all tests - # This will list all lines in this file that is starting with test_ remove () { and add this script name in - # front of it and execute it in a sub shell - # e.g. from this file: - # - # test_pytest() { # Test ops and other things - # bla bla bla - # } - # test_pytest_ethosu_fvp() { # Same as test_pytest but ... - # bla bla bla - # } - #... - # become a small script: - # ---- - # backends/arm/test/test_arm_baremetal.sh test_pytest # Test ops and other things - # backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp # Same as test_pytest but ... - # ... - # ---- - # That is executed - echo "${TEST_SUITE_NAME}: Run all tests" - grep "^test_" backends/arm/test/test_arm_baremetal.sh | sed 's/([^)]*)[[:space:]]*{*//g' | sed "s|^|$0 |" | sh + # This will list all lines in this file that is starting with test_ remove () { and add this script name in + # front of it and execute it in a sub shell + # e.g. 
from this file: + # + # test_pytest() { # Test ops and other things + # bla bla bla + # } + # test_pytest_ethosu_fvp() { # Same as test_pytest but ... + # bla bla bla + # } + #... + # become a small script: + # ---- + # backends/arm/test/test_arm_baremetal.sh test_pytest # Test ops and other things + # backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp # Same as test_pytest but ... + # ... + # ---- + # That is executed + echo "${TEST_SUITE_NAME}: Run all tests" + grep "^test_" backends/arm/test/test_arm_baremetal.sh | sed 's/([^)]*)[[:space:]]*{*//g' | sed "s|^|$0 |" | sh + echo "${TEST_SUITE_NAME}: PASS" } test_pytest() { # Test ops and other things @@ -67,6 +68,7 @@ test_pytest() { # Test ops and other things # Run arm baremetal pytest tests without FVP pytest --verbose --color=yes --numprocesses=auto backends/arm/test/ + echo "${TEST_SUITE_NAME}: PASS" } test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using Corstone FVP @@ -80,28 +82,68 @@ test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using # Run arm baremetal pytest tests with FVP pytest --verbose --color=yes --numprocesses=auto backends/arm/test/ --arm_run_corstoneFVP + echo "${TEST_SUITE_NAME}: PASS" } -test_run_ethosu_fvp() { # End to End model tests +test_run_ethosu_fvp() { # End to End model tests using run.sh echo "${TEST_SUITE_NAME}: Test ethos-u delegate examples with run.sh" source examples/arm/ethos-u-scratch/setup_path.sh # TOSA quantized echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA" - examples/arm/run.sh --target=TOSA --model_name=mv2 - examples/arm/run.sh --target=TOSA --model_name=lstm - examples/arm/run.sh --target=TOSA --model_name=edsr + examples/arm/run.sh --target=TOSA --model_name=add + examples/arm/run.sh --target=TOSA --model_name=mul # Ethos-U55 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55" - examples/arm/run.sh --target=ethos-u55-128 --model_name=mv2 - examples/arm/run.sh --target=ethos-u55-128 --model_name=lstm + examples/arm/run.sh --target=ethos-u55-128 --model_name=add + examples/arm/run.sh --target=ethos-u55-128 --model_name=mul # Ethos-U85 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85" - examples/arm/run.sh --target=ethos-u85-128 --model_name=mv2 - examples/arm/run.sh --target=ethos-u85-128 --model_name=lstm + examples/arm/run.sh --target=ethos-u85-128 --model_name=add + examples/arm/run.sh --target=ethos-u85-128 --model_name=mul + echo "${TEST_SUITE_NAME}: PASS" } +test_models_ethosu_fvp() { # End to End model tests using model_test.py + echo "${TEST_SUITE_NAME}: Test ethos-u delegate models with test_model.py" + + source examples/arm/ethos-u-scratch/setup_path.sh + + # Build common libs once + python3 backends/arm/test/test_model.py --build_libs + + # TOSA quantized + echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA" + python3 backends/arm/test/test_model.py --target=TOSA --model=mv2 + python3 backends/arm/test/test_model.py --target=TOSA --model=mv3 + python3 backends/arm/test/test_model.py --target=TOSA --model=lstm + python3 backends/arm/test/test_model.py --target=TOSA --model=edsr + + # Ethos-U55 + echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55" + python3 backends/arm/test/test_model.py --target=ethos-u55-128 --model=mv2 + python3 backends/arm/test/test_model.py --target=ethos-u55-64 --model=mv3 + python3 backends/arm/test/test_model.py --target=ethos-u55-256 --model=lstm + + # Ethos-U85 + echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85" + python3 backends/arm/test/test_model.py 
--target=ethos-u85-256 --model=mv2 + python3 backends/arm/test/test_model.py --target=ethos-u85-1024 --model=mv3 + python3 backends/arm/test/test_model.py --target=ethos-u85-128 --model=lstm + echo "${TEST_SUITE_NAME}: PASS" + } + +test_full_ethosu_fvp() { # All End to End model tests + echo "${TEST_SUITE_NAME}: Test ethos-u delegate models and examples on fvp" + + test_models_ethosu_fvp + test_run_ethosu_fvp + echo "${TEST_SUITE_NAME}: PASS" + } + + + ${TEST_SUITE} \ No newline at end of file diff --git a/backends/arm/test/test_model.py b/backends/arm/test/test_model.py new file mode 100755 index 0000000000..990b9e5f70 --- /dev/null +++ b/backends/arm/test/test_model.py @@ -0,0 +1,247 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os +import platform +import subprocess +import sys + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--build_libs", + action="store_true", + required=False, + default=False, + help="Flag for building executorch libs needed for this testing", + ) + parser.add_argument( + "--model", + required=False, + default=None, + help="Model to use that aot_arm_compiler.py can handle, can be a builtin, examples/models or a filename.", + ) + parser.add_argument( + "--target", + required=False, + default=None, + help="Target name", + ) + parser.add_argument( + "--test_output", + required=False, + default="arm_test", + help="Output folder used for build and test defults to arm_test", + ) + parser.add_argument( + "--system_config", + required=False, + default=None, + help="Target specific system_config (See Vela compiler)", + ) + parser.add_argument( + "--memory_mode", + required=False, + default=None, + help="Target specific memory_mode (See Vela compiler)", + ) + parser.add_argument( + "--no_intermediate", + action="store_true", + required=False, + default=False, + help="Don't save temporary files during compilation", + ) + + args = parser.parse_args() + + if args.model and "ethos-u" in args.target and args.system_config is None: + if "u55" in args.target: + args.system_config = "Ethos_U55_High_End_Embedded" + elif "u85" in args.target: + args.system_config = "Ethos_U85_SYS_DRAM_Mid" + else: + raise RuntimeError(f"Invalid target name {args.target}") + + if args.model and "ethos-u" in args.target and args.memory_mode is None: + if "u55" in args.target: + args.memory_mode = "Shared_Sram" + elif "u85" in args.target: + args.memory_mode = "Sram_Only" + else: + raise RuntimeError(f"Invalid target name {args.target}") + + return args + + +def run_external_cmd(cmd: []): + print("CALL:", *cmd, sep=" ") + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as err: + print("ERROR called: ", *cmd, sep=" ") + print(f"Failed with: {err.returncode}") + sys.exit(err.returncode) + + +def build_libs(et_build_root: str, script_path: str): + run_external_cmd( + [ + "bash", + os.path.join(script_path, "build_executorch.sh"), + f"--et_build_root={et_build_root}", + "--build_type=Release", + ] + ) + run_external_cmd( + [ + "bash", + os.path.join(script_path, "build_portable_kernels.sh"), + f"--et_build_root={et_build_root}", + "--build_type=Release", + "--portable_kernels=aten::_softmax.out", + ] + ) + run_external_cmd( + [ + "bash", + os.path.join(script_path, "build_quantized_ops_aot_lib.sh"), + f"--et_build_root={et_build_root}", + "--build_type=Release", + ] + ) + + +def 
build_pte( + et_build_root: str, + model_name: str, + target: str, + system_config: str, + memory_mode: str, + build_output: str, + no_intermediate: bool, +): + soext = {"Darwin": "dylib", "Linux": "so", "Windows": "dll"}.get( + platform.system(), None + ) + solibs_path = os.path.join( + et_build_root, + "cmake-out-aot-lib", + "kernels", + "quantized", + f"libquantized_ops_aot_lib.{soext}", + ) + solibs = f"--so_library={solibs_path}" + + intermediate = "" + if not no_intermediate: + intermediate = f"--intermediate={output}" + + run_external_cmd( + [ + "python3", + "-m", + "examples.arm.aot_arm_compiler", + "--delegate", + "--quantize", + intermediate, + f"--model_name={model_name}", + f"--target={target}", + f"--output={build_output}", + f"--system_config={system_config}", + f"--memory_mode={memory_mode}", + solibs, + ] + ) + + pte_file = os.path.join(output, f"{model_name}_arm_delegate_{args.target}.pte") + return pte_file + + +def build_ethosu_runtime( + et_build_root: str, + script_path: str, + pte_file: str, + target: str, + system_config: str, + elf_build_path: str, +): + run_external_cmd( + [ + "bash", + os.path.join(script_path, "build_executorch_runner.sh"), + f"--et_build_root={et_build_root}", + f"--pte={pte_file}", + f"--target={target}", + "--build_type=Release", + f"--system_config={system_config}", + f"--output={elf_build_path}", + ] + ) + + elf_file = os.path.join(elf_build_path, "cmake-out", "arm_executor_runner") + return elf_file + + +def run_elf_with_fvp(script_path: str, elf_file: str, target: str): + run_external_cmd( + [ + "bash", + os.path.join(script_path, "run_fvp.sh"), + f"--elf={elf_file}", + f"--target={target}", + ] + ) + + +if __name__ == "__main__": + + args = get_args() + script_path = os.path.join("backends", "arm", "scripts") + + if args.build_libs: + build_libs(args.test_output, script_path) + + if args.model: + model_name = args.model.split(" ")[0].split(";")[0] + if not model_name: + print("ERROR: Bad --model specified") + if not args.target: + print("ERROR: --model need --target to also be set") + + output = os.path.join( + args.test_output, f"{model_name}_arm_delegate_{args.target}" + ) + + pte_file = build_pte( + args.test_output, + model_name, + args.target, + args.system_config, + args.memory_mode, + output, + args.no_intermediate, + ) + print(f"PTE file created: {pte_file} ") + + if "ethos-u" in args.target: + elf_build_path = os.path.join( + output, f"{model_name}_arm_delegate_{args.target}" + ) + + elf_file = build_ethosu_runtime( + args.test_output, + script_path, + pte_file, + args.target, + args.system_config, + elf_build_path, + ) + print(f"ELF file created: {elf_file} ") + + run_elf_with_fvp(script_path, elf_file, args.target) + print(f"Model: {model_name} on {args.target} -> PASS") diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index e2ac3de5ca..65dd543058 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -22,7 +22,10 @@ endif() include(${EXECUTORCH_ROOT}/build/Utils.cmake) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. 
+ ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) + +add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS) if(EXECUTORCH_CADENCE_CPU_RUNNER) include(${EXECUTORCH_ROOT}/build/Codegen.cmake) @@ -74,10 +77,12 @@ endif() if(EXECUTORCH_NNLIB_OPT) set(TARGET_DIR hifi) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib + ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) elseif(EXECUTORCH_FUSION_G3_OPT) set(TARGET_DIR fusion_g3) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib + ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) else() set(TARGET_DIR reference) endif() diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 0590e69460..2dd3c4dc49 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -180,6 +180,7 @@ python_library( typing = True, deps = [ "//caffe2:torch", + ":ops_registrations", ":compiler_utils", "//executorch/backends/cadence/aot:pass_utils", "//executorch/backends/cadence/aot:utils", @@ -255,6 +256,7 @@ python_library( "//executorch/backends/cadence/aot:pass_utils", "//executorch/backends/cadence/aot:remove_ops", "//executorch/backends/cadence/aot:utils", + "//executorch/backends/transforms:replace_scalar_with_tensor", "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", "//executorch/exir/dialects/edge:lib", diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 7a98d704d8..f74d2c7a32 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -219,11 +219,6 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out -- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out - - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/backends/cadence/aot/fuse_ops.py b/backends/cadence/aot/fuse_ops.py index aa79b5582a..f8a4b114e2 100644 --- a/backends/cadence/aot/fuse_ops.py +++ b/backends/cadence/aot/fuse_ops.py @@ -16,6 +16,9 @@ from numbers import Number from typing import cast, Sequence +# Import these for the cadence function signatures. 
+import executorch.backends.cadence.aot.ops_registrations # noqa: F401 + import torch import torch.fx from executorch.backends.cadence.aot.compiler_utils import ( @@ -849,7 +852,10 @@ def attempt_fusion( if isinstance(arg, torch.fx.Node) and isinstance(arg.target, EdgeOpOverload) and get_edge_overload_packet(arg.target) - == exir_ops.edge.quantized_decomposed.dequantize_per_tensor + in ( + exir_ops.edge.quantized_decomposed.dequantize_per_tensor, + exir_ops.edge.cadence.dequantize_per_tensor, + ) ] multiplier_nodes = [ arg diff --git a/backends/cadence/aot/memory_planning.py b/backends/cadence/aot/memory_planning.py index 77ae7eb799..8c64fab61c 100644 --- a/backends/cadence/aot/memory_planning.py +++ b/backends/cadence/aot/memory_planning.py @@ -46,6 +46,7 @@ def get_aligned_offset(pre_aligned_offset: int, alignment: int) -> int: def collect_specs_from_graph_module( graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, alloc_graph_input: bool, alloc_graph_output: bool, ) -> Iterable[TensorSpec]: @@ -56,6 +57,7 @@ def collect_specs_from_graph_module( # Collect the specs from all the nodes in the graph module, and return it return collect_specs_from_nodes( graph_module.graph.nodes, + graph_signature, ignore_graph_input=not alloc_graph_input, ignore_graph_output=not alloc_graph_output, ) @@ -107,7 +109,7 @@ def memory_available(spec: TensorSpec) -> bool: # Iterate over all the specs in sorted order for spec in sorted( collect_specs_from_graph_module( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, graph_signature, alloc_graph_input, alloc_graph_output ), key=lambda spec: spec.allocated_memory, reverse=True, @@ -182,7 +184,7 @@ def greedy_by_size_for_offset_calculation_with_hierarchy( # Iterate over all the specs in sorted order for spec in sorted( collect_specs_from_graph_module( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, graph_signature, alloc_graph_input, alloc_graph_output ), key=lambda spec: spec.allocated_memory, reverse=True, @@ -250,6 +252,7 @@ def greedy_by_size_for_offset_calculation_with_hierarchy( def find_peak_memory_usages_per_memory( graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, alloc_graph_input: bool, alloc_graph_output: bool, mem_constraints: Optional[MemConstraints] = None, @@ -265,7 +268,7 @@ def find_peak_memory_usages_per_memory( # go through all nodes in the graph, collect memory usage per spec.mem_id for spec in collect_specs_from_graph_module( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, graph_signature, alloc_graph_input, alloc_graph_output ): if mem_constraints is not None and mem_constraints.skipped_spec(spec): continue @@ -288,6 +291,7 @@ def find_peak_memory_usages_per_memory( def find_peak_memory_usage( graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, alloc_graph_input: bool, alloc_graph_output: bool, mem_constraints: Optional[MemConstraints] = None, @@ -303,7 +307,7 @@ def find_peak_memory_usage( # Iterate over all the node specs for spec in collect_specs_from_graph_module( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, graph_signature, alloc_graph_input, alloc_graph_output ): if spec.lifetime[0] is None or ( mem_constraints is not None and mem_constraints.skipped_spec(spec) @@ -358,6 +362,7 @@ def print_memory_planning_info( # Get the peak memory usages per memory space peak_memory_usages_per_memory = find_peak_memory_usages_per_memory( executorch_prog.exported_program().graph_module, 
+ executorch_prog.exported_program().graph_signature, alloc_graph_input, alloc_graph_output, mem_constraints, @@ -393,6 +398,7 @@ def print_memory_planning_info( # Get the total peak memory usage across all memory spaces total_peak_memory_usage = find_peak_memory_usage( executorch_prog.exported_program().graph_module, + executorch_prog.exported_program().graph_signature, alloc_graph_input, alloc_graph_output, mem_constraints, @@ -453,7 +459,17 @@ def _init_mem_algos(self) -> None: greedy_by_size_for_offset_calculation_with_hierarchy, ] - def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult: + def __call__( + self, + graph_module: torch.fx.GraphModule, + ) -> PassResult: + return self.run(graph_module) + + def run( + self, + graph_module: torch.fx.GraphModule, + graph_signature: Optional[ExportGraphSignature] = None, + ) -> PassResult: mem_constraints = MemConstraints( opt_level=self.opt_level, alloc_graph_input=self.alloc_graph_input, @@ -475,6 +491,6 @@ def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult: alloc_graph_output=self.alloc_graph_output, alignment=self.mem_alignment, ) - mem_planning(graph_module) + mem_planning.run(graph_module, graph_signature) return PassResult(graph_module, True) diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a8dd131584..1e328cf4e5 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -99,6 +99,10 @@ "quantized_add(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, " "Tensor Y_zero_point, float out_scale, int out_zero_point) -> (Tensor Z)" ) +lib.define( + "quantized_add.per_tensor(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, " + "int Y_zero_point, float out_scale, int out_zero_point) -> (Tensor Z)" +) lib.define( "quantized_mul(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, " "Tensor Y_zero_point, float out_scale, int out_zero_point) -> (Tensor Z)" @@ -175,6 +179,10 @@ "quantized_add.out(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, " "Tensor Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantized_add.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, " + "int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)" +) lib.define( "quantized_mul.out(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, " "Tensor Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) 
out) -> Tensor(a!)" @@ -290,6 +298,42 @@ def dequantize_per_tensor_meta( return input.new_empty(input.size(), dtype=torch.float) +@register_fake("cadence::quantized_add") +def quantized_add_meta( + X: torch.Tensor, + X_scale: torch.Tensor, + X_zero_point: torch.Tensor, + Y: torch.Tensor, + Y_scale: torch.Tensor, + Y_zero_point: torch.Tensor, + out_scale: float, + out_zero_point: int, +) -> torch.Tensor: + out_size = X.size() + if list(X.size()) == [1]: + out_size = Y.size() + + return X.new_empty(out_size, dtype=X.dtype) + + +@register_fake("cadence::quantized_add.per_tensor") +def quantized_add_per_tensor_meta( + X: torch.Tensor, + X_scale: float, + X_zero_point: int, + Y: torch.Tensor, + Y_scale: float, + Y_zero_point: int, + out_scale: float, + out_zero_point: int, +) -> torch.Tensor: + out_size = X.size() + if list(X.size()) == [1]: + out_size = Y.size() + + return X.new_empty(out_size, dtype=X.dtype) + + @register_fake("cadence::quantized_linear") def quantized_linear_meta( src: torch.Tensor, diff --git a/backends/cadence/aot/pass_utils.py b/backends/cadence/aot/pass_utils.py index d0166061c7..3d73e7f8c1 100644 --- a/backends/cadence/aot/pass_utils.py +++ b/backends/cadence/aot/pass_utils.py @@ -7,7 +7,7 @@ # pyre-strict from dataclasses import dataclass -from typing import Callable, List, Optional, Set, Union +from typing import Callable, List, Optional, Set, Type, Union import torch from executorch.backends.cadence.aot.utils import get_edge_overload_packet @@ -32,33 +32,33 @@ class CadencePassAttribute: # A dictionary that maps an ExportPass to its attributes. -ALL_CADENCE_PASSES: dict[ExportPass, CadencePassAttribute] = {} +ALL_CADENCE_PASSES: dict[Type[ExportPass], CadencePassAttribute] = {} -def get_cadence_pass_attribute(p: ExportPass) -> CadencePassAttribute: +def get_cadence_pass_attribute(p: Type[ExportPass]) -> CadencePassAttribute: return ALL_CADENCE_PASSES[p] # A decorator that registers a pass. def register_cadence_pass( pass_attribute: CadencePassAttribute, -) -> Callable[[ExportPass], ExportPass]: - def wrapper(cls: ExportPass) -> ExportPass: +) -> Callable[[Type[ExportPass]], Type[ExportPass]]: + def wrapper(cls: Type[ExportPass]) -> Type[ExportPass]: ALL_CADENCE_PASSES[cls] = pass_attribute return cls return wrapper -def get_all_available_cadence_passes() -> Set[ExportPass]: +def get_all_available_cadence_passes() -> Set[Type[ExportPass]]: return set(ALL_CADENCE_PASSES.keys()) # Create a new filter to filter out relevant passes from all passes. def create_cadence_pass_filter( opt_level: int, debug: bool = False -) -> Callable[[ExportPass], bool]: - def _filter(p: ExportPass) -> bool: +) -> Callable[[Type[ExportPass]], bool]: + def _filter(p: Type[ExportPass]) -> bool: pass_attribute = get_cadence_pass_attribute(p) return ( pass_attribute.opt_level is not None diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py index ab23149e60..4e27f83c13 100644 --- a/backends/cadence/aot/passes.py +++ b/backends/cadence/aot/passes.py @@ -6,7 +6,7 @@ # pyre-strict -from typing import Any, List, Optional, Type +from typing import Any, cast, List, Optional, Type import torch import torch.fx @@ -95,9 +95,9 @@ def get_cadence_passes( passes = get_passes_in_default_order() pass_filter = create_cadence_pass_filter(opt_level) filtered_passes = [ - # pyre-fixme[20]: Call `torch.fx.passes.infra.pass_base.PassBase.__call__` expects argument `graph_module`. filtered_pass() # pyre-fixme[6]: In call `filter.__new__` ... 
got `List[Type[typing.Callable[[GraphModule], Optional[PassResult]]]]`. for filtered_pass in list(filter(pass_filter, passes)) ] - return filtered_passes + # The type checker can't infer the proper type of the list comprehension. + return cast(List[Optional[PassResult]], filtered_passes) diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py index 7c05e9b867..51d019f155 100644 --- a/backends/cadence/aot/quantizer/fusion_pass.py +++ b/backends/cadence/aot/quantizer/fusion_pass.py @@ -11,6 +11,7 @@ import torch from executorch.backends.cadence.aot.quantizer.patterns import ( AddmmPattern, + AddPattern, BmmPattern, Conv1dPattern, Conv2dPattern, @@ -41,6 +42,47 @@ ReluPatterns = (ReluPattern0, ReluPattern1) +def get_args_and_kwargs_add( + graph_module: GraphModule, + inputs_inputs: List[fx.Node], + dequants_inputs: List[fx.Node], + quant_node: fx.Node, +) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]: + X_scale_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], dequants_inputs[0].args[1]), + {"dtype": torch.float}, + ) + X_zero_point_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], dequants_inputs[0].args[2]), + {"dtype": torch.int32}, + ) + Y_scale_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], dequants_inputs[1].args[1]), + {"dtype": torch.float}, + ) + Y_zero_point_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], dequants_inputs[1].args[2]), + {"dtype": torch.int32}, + ) + args = ( + inputs_inputs[0], + X_scale_, + X_zero_point_, + inputs_inputs[1], + Y_scale_, + Y_zero_point_, + quant_node.args[1], + quant_node.args[2], + ) + + kwargs = {} + return args, kwargs + + # Helper function to get the args and kwargs for the linear replacement op def get_args_and_kwargs_linear( graph_module: GraphModule, @@ -339,7 +381,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 ) for fused_partition in fused_partitions: anchors = pattern.get_anchors(graph_module, fused_partition) - if not anchors: + if not anchors or anchors.empty: continue if any(self.is_fused(p.nodes) for p in fused_partition): continue @@ -385,7 +427,14 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 inputs_inputs + weights_inputs + other_inputs + bias_inputs ) kwargs = {} - if isinstance(pattern, (Conv1dPattern, Conv2dPattern)): + if isinstance(pattern, AddPattern): + args, kwargs = get_args_and_kwargs_add( + graph_module, + inputs_inputs, + dequants_inputs, + quant_node, + ) + elif isinstance(pattern, (Conv1dPattern, Conv2dPattern)): args, kwargs = get_args_and_kwargs_conv( graph_module, inputs_inputs, diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 0dee8ebcd1..0e907812b1 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -43,6 +43,7 @@ class PartitionAnchors: output: List[Union[Tuple[fx.Node], Tuple[fx.Node, SharedQuantizationSpec]]] = field( default_factory=list ) + empty: bool = False class QuantizationPattern(ABC): @@ -101,6 +102,38 @@ def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_linear +class AddPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.add.Tensor] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: 
`Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + add_node = fused_partition[0].nodes[-1] + + # Bail if: + # - the add node is not a tensor add + # - the add node has kwargs (e.g. alpha) + is_tensor_add = isinstance(add_node.args[0], fx.Node) and isinstance( + add_node.args[1], fx.Node + ) + if not is_tensor_add or len(add_node.kwargs) > 0: + return PartitionAnchors( + empty=True, + ) + + return PartitionAnchors( + inputs=[(add_node, 0), (add_node, 1)], + weights=[], + biases=[], + output=[(add_node,)], + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_add.default + + class BmmPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.bmm.default] diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index d6765d2ad3..42cc1a1df1 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -6,11 +6,13 @@ # pyre-strict +from dataclasses import dataclass from typing import List, Optional, Tuple, Union import torch from executorch.backends.cadence.aot.quantizer.patterns import ( AddmmPattern, + AddPattern, BmmPattern, Conv1dPattern, Conv2dPattern, @@ -108,7 +110,7 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: continue anchors = self.pattern.get_anchors(model, fused_partition) - if not anchors: + if not anchors or anchors.empty: continue if is_annotated( [ @@ -177,6 +179,8 @@ def get_cadence_default_quantizers() -> List[Quantizer]: ] +# Note: need dataclass to be used in CI configs through OmegaConf and Hydra +@dataclass class CadenceQuantizer(ComposableQuantizer): """ Generic CadenceQuantizer. Although it can be used directly, it is typically a base @@ -208,3 +212,15 @@ def __init__( self, ) -> None: super().__init__([]) + + +class CadenceWakeWordQuantizer(CadenceQuantizer): + """ + Quantizer for WakeWord, including add + """ + + def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: + if quantizers is None: + quantizers = get_cadence_default_quantizers() + quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8uW8u)) + super().__init__(quantizers) diff --git a/backends/cadence/aot/remove_ops.py b/backends/cadence/aot/remove_ops.py index caceabfba8..942f6d5553 100644 --- a/backends/cadence/aot/remove_ops.py +++ b/backends/cadence/aot/remove_ops.py @@ -569,6 +569,8 @@ class Subgraph: exir_ops.edge.aten.hardtanh.default, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.cadence.quantize_per_tensor.default, + exir_ops.edge.cadence.dequantize_per_tensor.default, } # must be initialized in the constructor diff --git a/backends/cadence/aot/reorder_ops.py b/backends/cadence/aot/reorder_ops.py index 0fd7f0b61a..e8a8e23053 100644 --- a/backends/cadence/aot/reorder_ops.py +++ b/backends/cadence/aot/reorder_ops.py @@ -118,6 +118,8 @@ def get_descendent_quant_ops(self, node: torch.fx.Node) -> List[torch.fx.Node]: if user_target in { torch.ops.quantized_decomposed.quantize_per_tensor, exir_ops.edge.quantized_decomposed.quantize_per_tensor, + torch.ops.cadence.quantize_per_tensor, + exir_ops.edge.cadence.quantize_per_tensor, }: descendent_quant_ops.append(user) # If the successor is a trivially quantizable op, consider its users @@ -300,6 +302,8 @@ def advance_quantize_op(self, graph_module: torch.fx.GraphModule): if get_overload_packet(node.target) not in ( 
exir_ops.edge.quantized_decomposed.quantize_per_tensor, torch.ops.quantized_decomposed.quantize_per_tensor, + exir_ops.edge.cadence.quantize_per_tensor, + torch.ops.cadence.quantize_per_tensor, ): continue @@ -413,6 +417,7 @@ def postponing_feasible(self, dequant_node: torch.fx.Node): in { exir_ops.edge.quantized_decomposed.quantize_per_tensor, exir_ops.edge.quantized_decomposed.quantize_per_channel, + exir_ops.edge.cadence.quantize_per_tensor, } for x in users ) @@ -422,6 +427,7 @@ def postpone_dequantize_op(self, graph_module: torch.fx.GraphModule) -> bool: packet_to_overload_map = { exir_ops.edge.quantized_decomposed.dequantize_per_tensor: "default", exir_ops.edge.quantized_decomposed.dequantize_per_channel: "default", + exir_ops.edge.cadence.dequantize_per_tensor: "default", } graph = graph_module.graph modified = False @@ -500,6 +506,7 @@ class SinkOpsCloserToUsePass(ExportPass): exir_ops.edge.aten.dequantize, exir_ops.edge.quantized_decomposed.dequantize_per_tensor, exir_ops.edge.quantized_decomposed.dequantize_per_channel, + exir_ops.edge.cadence.dequantize_per_tensor, } def sink_ops_closer_to_use(self, graph_module: torch.fx.GraphModule): @@ -558,6 +565,7 @@ class HoistOpsCloserToDefPass(ExportPass): hoistable_ops: Set[EdgeOpOverload] = { exir_ops.edge.quantized_decomposed.quantize_per_tensor, + exir_ops.edge.cadence.quantize_per_tensor, exir_ops.edge.aten.slice_copy, exir_ops.edge.aten.select_copy, } diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index 487d374fb8..f91fb26ddc 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -162,11 +162,12 @@ def call_operator( kwargs: Dict[str, Argument], meta: NodeMetadata, ) -> ProxyValue: - if op not in {exir_ops.edge.quantized_decomposed.quantize_per_tensor.default}: + ns = exir_ops.edge if isinstance(op, EdgeOpOverload) else torch.ops + if op != ns.quantized_decomposed.quantize_per_tensor.default: return super().call_operator(op, args, kwargs, meta) return super().call_operator( - exir_ops.edge.cadence.quantize_per_tensor.default, + ns.cadence.quantize_per_tensor.default, args, kwargs, meta, @@ -188,11 +189,12 @@ def call_operator( kwargs: Dict[str, Argument], meta: NodeMetadata, ) -> ProxyValue: - if op not in {exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default}: + ns = exir_ops.edge if isinstance(op, EdgeOpOverload) else torch.ops + if op != ns.quantized_decomposed.dequantize_per_tensor.default: return super().call_operator(op, args, kwargs, meta) return super().call_operator( - exir_ops.edge.cadence.dequantize_per_tensor.default, + ns.cadence.dequantize_per_tensor.default, args, kwargs, meta, @@ -1717,9 +1719,9 @@ def call_operator(self, op, args, kwargs, meta): ) -@register_cadence_pass(CadencePassAttribute(opt_level=0))( - ReplaceScalarWithTensorArgPass() -) +register_cadence_pass(CadencePassAttribute(opt_level=0))(ReplaceScalarWithTensorArgPass) + + @register_cadence_pass(CadencePassAttribute(opt_level=0)) class ReplaceScalarTensorWithFullPass(ExportPass): """ @@ -1837,6 +1839,10 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass): replaced_scalar_args: dict[ EdgeOpOverloadPacket, tuple[EdgeOpOverload, Sequence[int]] ] = { + exir_ops.edge.cadence.quantized_add: ( + exir_ops.edge.cadence.quantized_add.per_tensor, + [1, 2, 4, 5], + ), exir_ops.edge.cadence.quantized_conv: ( exir_ops.edge.cadence.quantized_conv.per_tensor, [8, 9, 12, 13], diff --git a/backends/cadence/aot/tests/test_memory_passes.py 
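For the new `quantized_add` entry in `ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass`, the indices `[1, 2, 4, 5]` line up with `X_scale`, `X_zero_point`, `Y_scale`, and `Y_zero_point` in the schema registered earlier in `ops_registrations.py`. A small illustrative before/after; the operand values and `full` producers are made up for the example:

```python
# Illustrative rewrite performed by the pass, not code from the patch.
#
# Before: per-tensor quantization params arrive as 1-element tensors
# built with aten.full, matching the Tensor-typed schema:
#
#   z = cadence.quantized_add(
#       x, full([1], 0.1), full([1], 2),   # X, X_scale, X_zero_point
#       y, full([1], 0.2), full([1], 3),   # Y, Y_scale, Y_zero_point
#       0.05, 0)                           # out_scale, out_zero_point
#
# After: the pass swaps in the .per_tensor overload and passes the values
# at argument positions 1, 2, 4 and 5 as plain scalars:
#
#   z = cadence.quantized_add.per_tensor(x, 0.1, 2, y, 0.2, 3, 0.05, 0)
```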
b/backends/cadence/aot/tests/test_memory_passes.py index d50456796c..1844a3b4d8 100644 --- a/backends/cadence/aot/tests/test_memory_passes.py +++ b/backends/cadence/aot/tests/test_memory_passes.py @@ -46,14 +46,13 @@ def calculate_aligned_num_bytes(num: int, alignment: int = 16) -> int: inputs = (torch.ones(batch_size, input_dim),) model = PeakMemoryTestModel(input_dim, hidden_dim, output_dim) - graph_module = ( - compiler.export_to_executorch_gen_etrecord(model, inputs) - .exported_program() - .graph_module - ) + exported_program = compiler.export_to_executorch_gen_etrecord( + model, inputs + ).exported_program() peak_usage, _ = find_peak_memory_usage( - graph_module, + exported_program.graph_module, + exported_program.graph_signature, mem_constraints=None, alloc_graph_input=True, alloc_graph_output=True, @@ -73,14 +72,13 @@ def calculate_aligned_num_bytes(num: int, alignment: int = 16) -> int: input_dim, hidden_dim, hidden_dim, hidden_dim, output_dim ) - graph_module = ( - compiler.export_to_executorch_gen_etrecord(model, inputs) - .exported_program() - .graph_module - ) + exported_program = compiler.export_to_executorch_gen_etrecord( + model, inputs + ).exported_program() peak_usage, _ = find_peak_memory_usage( - graph_module, + exported_program.graph_module, + exported_program.graph_signature, mem_constraints=None, alloc_graph_input=True, alloc_graph_output=True, @@ -111,6 +109,7 @@ def forward(self, x): graph_module.graph.eliminate_dead_code() peak_usage, _ = find_peak_memory_usage( graph_module, + executorch_prog.exported_program().graph_signature, alloc_graph_input=False, alloc_graph_output=False, mem_constraints=None, diff --git a/backends/cadence/fusion_g3/operators/CMakeLists.txt b/backends/cadence/fusion_g3/operators/CMakeLists.txt index cac16bddc5..f39614ee4f 100644 --- a/backends/cadence/fusion_g3/operators/CMakeLists.txt +++ b/backends/cadence/fusion_g3/operators/CMakeLists.txt @@ -64,7 +64,8 @@ target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE xa_nnlib) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories( aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 9bbd386c75..270835dbb7 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -25,7 +25,8 @@ add_library( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c ) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories( cadence_kernels diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index d6820c0700..86b85bbfb6 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -67,7 +67,8 @@ target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. 
+${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories( aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} @@ -77,7 +78,7 @@ target_include_directories( # Custom ops that are needed to run the test model. add_library( custom_ops "op_quantized_linear_out.cpp" "op_quantized_layer_norm.cpp" - "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp" + "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp" "op_quantized_fully_connected_out" ) target_include_directories( custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp index 05c8659cbc..785e6f015d 100644 --- a/backends/cadence/hifi/operators/op_clamp.cpp +++ b/backends/cadence/hifi/operators/op_clamp.cpp @@ -328,7 +328,7 @@ Tensor& clamp_tensor_out( const executorch::aten::optional& min_opt, const executorch::aten::optional& max_opt, Tensor& out) { - clamp_Tensor_out(ctx, in, min_opt, max_opt, out); + return clamp_Tensor_out(ctx, in, min_opt, max_opt, out); } } // namespace native diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp index 59cf858158..4b93e55047 100644 --- a/backends/cadence/hifi/operators/op_mean.cpp +++ b/backends/cadence/hifi/operators/op_mean.cpp @@ -175,7 +175,7 @@ Tensor& mean_dim_out( bool keepdim, optional dtype, Tensor& out) { - mean_out(ctx, in, dim_list, keepdim, dtype, out); + return mean_out(ctx, in, dim_list, keepdim, dtype, out); } } // namespace native diff --git a/backends/cadence/hifi/operators/op_quantized_relu_out.cpp b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp index b8baa946b9..9b65751da7 100644 --- a/backends/cadence/hifi/operators/op_quantized_relu_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp @@ -100,10 +100,10 @@ void quantized_relu_per_tensor_out( void quantized_relu_out( KernelRuntimeContext& ctx, const Tensor& input, - const int64_t in_zero_point, + const Tensor& in_zero_point, const int64_t out_zero_point, - const int64_t out_multiplier, - const int64_t out_shift, + const Tensor& out_multiplier, + const Tensor& out_shift, Tensor& output) { quantized_relu_per_tensor_out( ctx, diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp index 852479ed93..25d3ad7d38 100644 --- a/backends/cadence/hifi/operators/op_softmax.cpp +++ b/backends/cadence/hifi/operators/op_softmax.cpp @@ -200,7 +200,7 @@ Tensor& softmax_out( int64_t dim, bool half_to_float, Tensor& out) { - _softmax_out(ctx, in, dim, half_to_float, out); + return _softmax_out(ctx, in, dim, half_to_float, out); } } // namespace native diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp index ac7559691a..94c1684fe0 100644 --- a/backends/cadence/hifi/operators/op_where.cpp +++ b/backends/cadence/hifi/operators/op_where.cpp @@ -183,6 +183,15 @@ Tensor& where_self_out( return out; } +Tensor& where_out( + RuntimeContext& ctx, + const Tensor& cond, + const Tensor& a, + const Tensor& b, + Tensor& out) { + return where_out(ctx, cond, a, b, out); +} + } // namespace native } // namespace HiFi } // namespace impl diff --git a/backends/cadence/reference/kernels/CMakeLists.txt b/backends/cadence/reference/kernels/CMakeLists.txt index 07394cbe83..3fe0fe2101 100644 --- a/backends/cadence/reference/kernels/CMakeLists.txt +++ 
b/backends/cadence/reference/kernels/CMakeLists.txt @@ -8,7 +8,8 @@ add_library(cadence_kernels kernels.cpp) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories(cadence_kernels PUBLIC . ${_common_include_directories} diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index a2d51af2c0..ce926d8601 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -71,7 +71,8 @@ target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories( aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 6ea94ba9e0..9b05ad871f 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -3106,6 +3106,173 @@ def test_qnn_backend_draw_graph(self): ), "Generated .dot file does not match the golden file." +class TestExampleLLMScript(TestQNN): + def required_envs(self, conditions=None) -> bool: + conditions = [] if conditions is None else conditions + return all( + [ + self.executorch_root, + self.artifact_dir, + *conditions, + ] + ) + + def test_llama3_2_1b(self): + if not self.required_envs(): + self.skipTest("missing required envs") + assert ( + self.llama_artifacts is not None + ), "Please provide path to llama artifacts" + + prompt = "What is the meaning of life?" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--model", + self.model, + "--checkpoint", + f"{self.llama_artifacts}/consolidated.00.pth", + "--params", + f"{self.llama_artifacts}/params.json", + "--tokenizer_model", + f"{self.llama_artifacts}/tokenizer.model", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + "--ptq", + "16a4w", + "--temperature", + "0", + "--llama_model", + "llama3_2", + "--model_mode", + "hybrid", + "--prefill_seq_len", + "32", + "--kv_seq_len", + "512", + "--num_sharding", + "4", + ] + if self.compile_only: + cmds.extend(["--compile_only"]) + elif self.device: + cmds.extend(["--device", self.device]) + if self.host: + cmds.extend(["--host", self.host]) + elif self.enable_x86_64: + cmds.extend(["--enable_x86_64"]) + if self.pre_gen_pte: + cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + + golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + if not self.compile_only: + model_out = msg["result"][0] + self.assertTrue( + model_out.startswith(golden_start_with), + f"Expected Output: {golden_start_with}. Actual Output: {model_out}", + ) + # x86 does not allow weight sharing, so we don't check pte size. 
+ # Inference speed on x86 is slow, so we only check when running on Android + if not self.enable_x86_64: + pte_size = msg["pte_size"] + self.assertLessEqual(pte_size, 1300000000) + if not self.compile_only and not self.enable_x86_64: + self.assertGreaterEqual(msg["inference_speed"], 66) # Lanai + + def test_llama_stories_110m(self): + if not self.required_envs(): + self.skipTest("missing required envs") + assert ( + self.llama_artifacts is not None + ), "Please provide path to llama artifacts" + + prompt = "Once" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--model", + self.model, + "--checkpoint", + f"{self.llama_artifacts}/stories110M.pt", + "--params", + f"{self.llama_artifacts}/params.json", + "--tokenizer_model", + f"{self.llama_artifacts}/tokenizer.model", + "--tokenizer_bin", + f"{self.llama_artifacts}/tokenizer.bin", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + "--ptq", + "16a4w", + "--temperature", + "0", + "--llama_model", + "stories110m", + "--model_mode", + "hybrid", + "--prefill_seq_len", + "32", + "--kv_seq_len", + "128", + ] + if self.compile_only: + cmds.extend(["--compile_only"]) + elif self.device: + cmds.extend(["--device", self.device]) + if self.host: + cmds.extend(["--host", self.host]) + elif self.enable_x86_64: + cmds.extend(["--enable_x86_64"]) + if self.pre_gen_pte: + cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + + golden_start_with = "Once upon a time," + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + if not self.compile_only: + model_out = msg["result"][0] + self.assertTrue( + model_out.startswith(golden_start_with), + f"Expected Output: {golden_start_with}. 
Actual Output: {model_out}", + ) + # x86 does not allow weight sharing, so we don't check pte size + if not self.enable_x86_64: + pte_size = msg["pte_size"] + self.assertLessEqual(pte_size, 130000000) + if not self.compile_only and not self.enable_x86_64: + self.assertGreaterEqual(msg["inference_speed"], 220) # Lanai + + class TestExampleOssScript(TestQNN): def required_envs(self, conditions=None) -> bool: conditions = [] if conditions is None else conditions @@ -4001,72 +4168,6 @@ def test_deeplab_v3(self): self.assertGreaterEqual(msg["MPA"], 0.70) self.assertGreaterEqual(msg["MIoU"], 0.55) - def test_stories_single_llama(self): - if not self.required_envs(): - self.skipTest("missing required envs") - - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--checkpoint", - f"{self.artifact_dir}/stories110M.pt", - "--params", - f"{self.artifact_dir}/params.json", - "--tokenizer_model", - f"{self.artifact_dir}/tokenizer.model", - "--tokenizer_bin", - f"{self.artifact_dir}/tokenizer.bin", - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - "Once", - "--ptq", - "16a4w", - "--temperature", - "0", - "--llama_model", - "stories110m", - "--model_mode", - "hybrid", - "--prefill_seq_len", - "32", - "--kv_seq_len", - "128", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - - golden_start_with = "Once upon a time," - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - if not self.compile_only: - model_out = msg["result"][0] - self.assertTrue(model_out.startswith(golden_start_with)) - # x86 does not allow weight sharing, so we don't check pte size - if not self.enable_x86_64: - pte_size = msg["pte_size"] - self.assertLessEqual(pte_size, 130000000) - @unittest.skip("dynamic shape inputs appear in recent torch.export.export") def test_mobilebert(self): if not self.required_envs([self.pretrained_weight]): @@ -4271,6 +4372,18 @@ def setup_environment(): type=str, ) + parser.add_argument( + "--pre_gen_pte", + help="Run the pre-generated pte in the given directory.", + type=str, + ) + + parser.add_argument( + "--llama_artifacts", + help="A folder that contains: weight, tokenizer, and params.", + type=str, + ) + args, ns_args = parser.parse_known_args(namespace=unittest) TestQNN.host = args.host TestQNN.device = args.device @@ -4289,6 +4402,8 @@ def setup_environment(): TestQNN.enable_x86_64 = args.enable_x86_64 TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs TestQNN.compile_only = args.compile_only + TestQNN.pre_gen_pte = args.pre_gen_pte + TestQNN.llama_artifacts = args.llama_artifacts return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 46cc9b65fc..eeebb6fd8a 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -188,6 +188,8 @@ class TestQNN(unittest.TestCase): shared_buffer: bool = False enable_x86_64: bool = False compile_only: bool = False + pre_gen_pte: str = "" + llama_artifacts: str = "" def _assert_outputs_equal(self, model_output, ref_output): 
self.assertTrue(len(ref_output) == len(model_output)) diff --git a/backends/transforms/fuse_view_copy.py b/backends/transforms/fuse_view_copy.py index bbc155dc45..22e20d1c88 100644 --- a/backends/transforms/fuse_view_copy.py +++ b/backends/transforms/fuse_view_copy.py @@ -40,7 +40,24 @@ def merge_view_copy_chains(graph: torch.fx.Graph) -> torch.fx.Graph: return graph +def remove_noop_view_copy(graph: torch.fx.Graph) -> torch.fx.Graph: + """ + Remove view_copy nodes that are no-ops. + """ + ops = exir_ops.edge + view_op = ops.aten.view_copy.default + for node in graph.nodes: + if node.op == "call_function" and node.target == view_op: + input_shape = list(node.args[0].meta["val"].shape) + target_shape = node.args[1] + if input_shape == target_shape: + node.replace_all_uses_with(node.args[0]) + graph.eliminate_dead_code() + return graph + + class FuseViewCopyTransform(ExportPass): def call(self, graph_module: torch.fx.GraphModule) -> PassResult: graph_module.graph = merge_view_copy_chains(graph_module.graph) + graph_module.graph = remove_noop_view_copy(graph_module.graph) return PassResult(graph_module, True) diff --git a/backends/transforms/targets.bzl b/backends/transforms/targets.bzl index c532798546..ec4e141286 100644 --- a/backends/transforms/targets.bzl +++ b/backends/transforms/targets.bzl @@ -201,6 +201,20 @@ def define_common_targets(): ], ) + runtime.python_library( + name = "replace_scalar_with_tensor", + srcs = [ + "replace_scalar_with_tensor.py", + ], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir:pass_base", + ], + ) + runtime.python_test( name = "test_duplicate_dynamic_quant_chain", srcs = [ diff --git a/backends/transforms/view_copy_to_squeeze_unsqueeze.py b/backends/transforms/view_copy_to_squeeze_unsqueeze.py index f4a0670072..08ed70b2fa 100644 --- a/backends/transforms/view_copy_to_squeeze_unsqueeze.py +++ b/backends/transforms/view_copy_to_squeeze_unsqueeze.py @@ -75,7 +75,11 @@ def find_unsqueeze_dim( j = 0 idx = -1 while j < len(view_shape): - if input_shape[i] != view_shape[j]: + # account for added dim being last dim in view_shape + if i == j and j == len(input_shape): + if view_shape[j] != 1: + return None + elif input_shape[i] != view_shape[j]: if view_shape[j] == 1: idx = j i -= 1 diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS index 59658e58f2..5478ad0eab 100644 --- a/backends/vulkan/_passes/TARGETS +++ b/backends/vulkan/_passes/TARGETS @@ -31,14 +31,15 @@ runtime.python_library( ) runtime.python_library( - name = "squeeze_int4_linear_inputs", + name = "squeeze_unsqueeze_inputs", srcs = [ - "squeeze_int4_linear_inputs.py", + "squeeze_unsqueeze_inputs.py", ], visibility = [ "//executorch/backends/...", ], deps = [ + "//caffe2:torch", "//executorch/backends/vulkan:custom_ops_lib", "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", @@ -114,7 +115,7 @@ runtime.python_library( ":remove_asserts", ":remove_local_scalar_dense", ":remove_redundant_ops", - ":squeeze_int4_linear_inputs", + ":squeeze_unsqueeze_inputs", ":tag_memory_meta_pass", ] ) diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py index 2a4a2b4b5c..220afa6a35 100644 --- a/backends/vulkan/_passes/__init__.py +++ b/backends/vulkan/_passes/__init__.py @@ -20,8 +20,8 @@ from executorch.backends.vulkan._passes.remove_redundant_ops import ( RemoveRedundantOpsTransform, ) -from executorch.backends.vulkan._passes.squeeze_int4_linear_inputs import ( - SqueezeInt4LinearInputs, +from 
executorch.backends.vulkan._passes.squeeze_unsqueeze_inputs import ( + SqueezeUnsqueezeInputs, ) from executorch.backends.vulkan._passes.tag_memory_meta_pass import TagMemoryMetaPass @@ -32,6 +32,6 @@ "RemoveAssertsTransform", "RemoveLocalScalarDenseOpsTransform", "RemoveRedundantOpsTransform", - "SqueezeInt4LinearInputs", + "SqueezeUnsqueezeInputs", "TagMemoryMetaPass", ] diff --git a/backends/vulkan/_passes/squeeze_int4_linear_inputs.py b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py similarity index 80% rename from backends/vulkan/_passes/squeeze_int4_linear_inputs.py rename to backends/vulkan/_passes/squeeze_unsqueeze_inputs.py index 95fcef7f75..a0160efa90 100644 --- a/backends/vulkan/_passes/squeeze_int4_linear_inputs.py +++ b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py @@ -6,16 +6,27 @@ # pyre-strict -from typing import Dict, List, Tuple +from typing import Dict, List, Set, Tuple, Union import executorch.backends.vulkan.custom_ops_lib # noqa: needed to access vk op from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.dialects.edge._ops import EdgeOpOverload from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue +from torch._ops import OpOverload + from torch.fx.node import Argument +OpType = Union[str, OpOverload, EdgeOpOverload] + + +class SqueezeUnsqueezeInputs(ExportPass): + _squeezable_ops: Set[OpType] = { + exir_ops.edge.et_vk.linear_weight_int4.default, + exir_ops.edge.aten.relu.default, + exir_ops.edge.aten.gelu.default, + } -class SqueezeInt4LinearInputs(ExportPass): def call_operator( self, op, # pyre-ignore @@ -26,7 +37,7 @@ def call_operator( def _squeezable(shape: List[int]) -> bool: return len(shape) > 2 and 1 in shape - if op != exir_ops.edge.et_vk.linear_weight_int4.default: + if op not in self._squeezable_ops: return super().call_operator(op, args, kwargs, meta) # pyre-ignore[16]: `None` has no attribute `node` diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl index deb03192af..c2fc5a5675 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl @@ -13,24 +13,18 @@ layout(std430) buffer; -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION sampler3D weight_in; -layout(set = 0, binding = 3) uniform PRECISION sampler3D bias_in; -layout(set = 0, binding = 4) uniform PRECISION sampler3D mean_in; -layout(set = 0, binding = 5) uniform PRECISION sampler3D var_in; +#include "indexing_utils.h" -layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "weight_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "mean_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "var_in", DTYPE, STORAGE)} -layout(set = 0, binding = 7) uniform PRECISION restrict Params { - float eps; -}; - -layout(set = 0, binding = 8) uniform PRECISION restrict Params2 { - int num_texel_per_batch; -}; +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "float", "eps")} +${layout_declare_ubo(B, "int", "num_texel_per_batch")} layout(local_size_x_id = 0, 
local_size_y_id = 1, local_size_z_id = 2) in; @@ -40,16 +34,16 @@ void main() { return; } - VEC4_T v = VEC4_T(texelFetch(image_in, pos, 0)); + VEC4_T v = VEC4_T(load_texel(t_in, pos)); ivec3 param_pos = ivec3(pos.z % num_texel_per_batch, 0, 0); - VEC4_T weight = VEC4_T(texelFetch(weight_in, param_pos, 0)); - VEC4_T bias = VEC4_T(texelFetch(bias_in, param_pos, 0)); - VEC4_T mean = VEC4_T(texelFetch(mean_in, param_pos, 0)); - VEC4_T var = VEC4_T(texelFetch(var_in, param_pos, 0)); + VEC4_T weight = VEC4_T(load_texel(weight_in, param_pos)); + VEC4_T bias = VEC4_T(load_texel(bias_in, param_pos)); + VEC4_T mean = VEC4_T(load_texel(mean_in, param_pos)); + VEC4_T var = VEC4_T(load_texel(var_in, param_pos)); v = ((v - mean) / sqrt(var + eps)) * weight + bias; - imageStore(image_out, pos, v); + write_texel(t_out, pos, v); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml index a92e44f636..116773c816 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml @@ -2,6 +2,7 @@ batchnorm: parameter_names_with_default_values: DTYPE: float NDIM: 3 + STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: half diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 71b7ce80cc..18599ed4ba 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -475,7 +475,12 @@ void add_conv1d_node( const ValueRef out, const bool clamp_out) { ValueRef arg_weight = prepack_standard( - graph, weight, graph.storage_type_of(out), utils::kChannelsPacked); + graph, + weight, + graph.storage_type_of(out), + utils::kChannelsPacked, + /* passthrough = */ false, + utils::kOptimizedAxisMap); ValueRef arg_bias = prepack_biases( graph, bias, diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index c6b444e5de..3cfcac13a8 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -26,7 +26,7 @@ insert_prepack_nodes, RemoveLocalScalarDenseOpsTransform, RemoveRedundantOpsTransform, - SqueezeInt4LinearInputs, + SqueezeUnsqueezeInputs, TagMemoryMetaPass, ) @@ -153,7 +153,7 @@ def preprocess( # noqa: C901 RemoveRedundantOpsTransform(), AddmmToLinearTransform(), FuseDequantLinearPass(), - SqueezeInt4LinearInputs(), + SqueezeUnsqueezeInputs(), FuseViewCopyTransform(), ViewCopyToSqueezeUnsqueezePass(), FuseBatchNormWithConvPass(program), diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index bf16855afc..872ba355c7 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -210,6 +210,11 @@ def _get_bias_deps( self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType ) -> Tuple[bool, List[torch.fx.Node]]: gemm_deps = [] + if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: + # if force force_fp32_dynamic_linear is enabled, then we + # do not partition the weight node + return (True, gemm_deps) + if len(node.all_input_nodes) > 2 and self.bias_idx is not None: bias_node = get_input_node(node, self.bias_idx) if bias_node: @@ -477,7 +482,15 @@ def find_partition_args(input_node): node.args = old_args node.users = old_users - return valid_deps, list(set(deps) | set(src_partition.nodes)) + # When using 
force_fp32_dynamic_linear, we want to get_deps to overwrite the source partition nodes. + # Else we want to be greedy. + ret_deps = ( + list(set(deps) & set(src_partition.nodes)) + if self.force_fp32_dynamic_linear + else list(set(deps) | set(src_partition.nodes)) + ) + + return valid_deps, ret_deps def supported_precision_types(self): return [ diff --git a/backends/xnnpack/test/ops/test_linear.py b/backends/xnnpack/test/ops/test_linear.py index eccda406b8..30bb4f0aba 100644 --- a/backends/xnnpack/test/ops/test_linear.py +++ b/backends/xnnpack/test/ops/test_linear.py @@ -31,6 +31,8 @@ ToEdgeTransformAndLower, ) +from torch.export.graph_signature import ExportGraphSignature, InputKind + try: from torchao.quantization.quant_api import ( int8_dynamic_activation_int4_weight, @@ -871,3 +873,71 @@ def test_linear_qd8_as_fp32(self): "dequantize_per_channel.default": 1, # 1: weight }, ) + + def test_linear_fp32_with_force_as_mm(self): + def check_signature( + signature: ExportGraphSignature, + force_flag: bool, + use_bias: bool, + legacy_mode: bool, + ): + num_params = 0 + if force_flag: + num_params = 1 # weight_param + if use_bias: + num_params += 1 # bias_param + sign_params: int = 0 + input_specs = signature.input_specs + for input_spec in input_specs: + if input_spec.kind == InputKind.PARAMETER: + sign_params += 1 + assert ( + sign_params == num_params + ), f"Expected {num_params} params, got {sign_params} with force_flag={force_flag}, use_bias={use_bias}, legacy_mode={legacy_mode}" + + for force_flag in (True, False): + for use_bias in (True, False): + for legacy_mode in (True, False): + module = BaseLinear( + in_size=8, + input_channels=13, + output_channels=17, + use_bias=use_bias, + ) + inputs = module.get_inputs() + tester = Tester(module, inputs).export() + partitioner = XnnpackPartitioner( + force_fp32_dynamic_linear=force_flag + ) + if legacy_mode: + tester.to_edge() + partitioner_stage = Partition(partitioner=partitioner) + tester.partition(partition_stage=partitioner_stage) + tester.check_not( + [ + ( + "executorch_exir_dialects_edge__ops_aten_mm_default" + if use_bias + else "executorch_exir_dialects_edge__ops_aten_addmm_default" + ) + ] + ) + else: + to_edge_and_transform_stage = ToEdgeTransformAndLower( + partitioners=[partitioner] + ) + tester.to_edge_transform_and_lower( + to_edge_and_transform_stage=to_edge_and_transform_stage + ) + tester.check_not( + ["executorch_exir_dialects_edge__ops_aten_linear_default"] + ) + + signature: ExportGraphSignature = ( + tester.get_artifact().exported_program().graph_signature + ) + check_signature(signature, force_flag, use_bias, legacy_mode) + + tester.to_executorch() + tester.serialize() + tester.run_method_and_compare_outputs() diff --git a/backends/xnnpack/test/ops/test_lstm.py b/backends/xnnpack/test/ops/test_lstm.py index bfc6113c41..be209082b3 100644 --- a/backends/xnnpack/test/ops/test_lstm.py +++ b/backends/xnnpack/test/ops/test_lstm.py @@ -54,9 +54,8 @@ def test_fp32_lstm_force_dynamic_linear(self): ) .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) # Weights are supplied as input to linears - .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0"]) - # Biases are owned by delegates - .check_not(["p_lstm_bias"]) + # Biases are not owned by delegates when force_fp32_dynamic_linear is set + .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0", "p_lstm_bias"]) .to_executorch() .serialize() .run_method_and_compare_outputs() diff --git a/build/Utils.cmake b/build/Utils.cmake index a27edf3366..113f4829b8 100644 --- 
a/build/Utils.cmake +++ b/build/Utils.cmake @@ -357,7 +357,7 @@ function(add_torch_to_cmake_prefix_path) endif() execute_process( COMMAND "${PYTHON_EXECUTABLE}" -c - "import torch as _; print(_.__path__[0], end='')" + "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])" OUTPUT_VARIABLE _tmp_torch_path ERROR_VARIABLE _tmp_torch_path_error RESULT_VARIABLE _tmp_torch_path_result COMMAND_ECHO STDERR diff --git a/docs/source/executorch-arm-delegate-tutorial.md b/docs/source/executorch-arm-delegate-tutorial.md index ff6d4abbba..feb8f0335f 100644 --- a/docs/source/executorch-arm-delegate-tutorial.md +++ b/docs/source/executorch-arm-delegate-tutorial.md @@ -200,7 +200,7 @@ Following script will serve as a helper utility to help us generate the `.pte` f ```bash python3 -m examples.arm.aot_arm_compiler --model_name="softmax" -# This should produce ./softmax.pte +# This should produce ./softmax_arm_ethos-u55-128.pte ``` ### Delegated Workflow @@ -221,12 +221,14 @@ Similar to the non-delegate flow, the same script will server as a helper utilit ```bash python3 -m examples.arm.aot_arm_compiler --model_name="add" --delegate -# should produce ./add_arm_delegate.pte +# should produce ./add_arm_delegate_ethos-u55-128.pte ``` ### Delegated Quantized Workflow Before generating the `.pte` file for delegated quantized networks like MobileNetV2, we need to build the `quantized_ops_aot_lib` +You can just run the `backends/arm/scripts/build_quantized_ops_aot_lib.sh` script to build this for you or build it yourself like this. + ```bash cd @@ -245,7 +247,7 @@ cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib After the `quantized_ops_aot_lib` build, we can run the following script to generate the `.pte` file ```bash python3 -m examples.arm.aot_arm_compiler --model_name="mv2" --delegate --quantize --so_library="$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.so)" -# should produce ./mv2_arm_delegate.pte.pte +# should produce ./mv2_arm_delegate_ethos-u55-128.pte ```
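For reference, the torch path lookup introduced in the `build/Utils.cmake` hunk above can be exercised on its own: `importlib.util.find_spec` resolves the installed package location without importing torch, which keeps the CMake configure step cheap. A standalone sketch, assuming only that `torch` is installed in the active Python environment:

```bash
# Resolve the installed torch package directory without importing torch
# (same expression add_torch_to_cmake_prefix_path now runs via ${PYTHON_EXECUTABLE}).
python3 -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])"
# Prints something like: /path/to/site-packages/torch
```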
@@ -262,6 +264,14 @@ Now let's try to run these `.pte` files on a Corstone-300 and Corstone-320 platf
In this section, we will go over steps that you need to go through to build the runtime application. This then run on the target device. In the executorch repository we have a functioning script which does the exact same steps. It is located at `executorch/examples/arm/run.sh`. We will use that to build necessary pieces and finally run the previously generated PTE file on an FVP.
+By default `run.sh` uses `arm_test/` as the build and output folder, and you will find the build artifacts under it. This can be controlled/overridden with the `--et_build_root` and `--output` flags if needed.
+
+For example, running `examples/arm/run.sh --model_name=add --target=ethos-u85-128` will produce a `.pte` and an `.elf` file like this:
+
+```bash
+arm_test/add/add_arm_delegate_ethos-u85-128.pte
+arm_test/add/cmake-out/arm_executor_runner
+```

Also before we get started, make sure that you have completed ExecuTorch cmake build setup, and the instructions to setup the development environment described [earlier](#set-up-the-developer-environment).

The block diagram below demonstrates, at the high level, how the various build artifacts are generated and are linked together to generate the final bare-metal executable.

@@ -286,23 +296,19 @@ To run a `.pte` file with the Arm backend delegate call instructions, we will ne
- `libexecutorch_delegate_ethos_u.a`

-These libraries are generated in `build_executorch` and `build_quantization_aot_lib` function of the `run.sh` script.
+These libraries are generated by the `backends/arm/scripts/build_executorch.sh`, `backends/arm/scripts/build_portable_kernels.sh` and `backends/arm/scripts/build_quantized_ops_aot_lib.sh` scripts, which are called from the `run.sh` script.

-In this function, `EXECUTORCH_SELECT_OPS_LIST` will decide the number of portable operators included in the build and are available at runtime. It must match with `.pte` file's requirements, otherwise you will get `Missing Operator` error at runtime.
+The `--portable_kernels` flag can be used to set the build flag `EXECUTORCH_SELECT_OPS_LIST` when running `backends/arm/scripts/build_portable_kernels.sh`; it decides which portable operators are included in the build and available at runtime. It must match the `.pte` file's requirements, otherwise you will get a `Missing Operator` error at runtime.

For example, there in the command line above, to run SoftmaxModule, we only included the softmax CPU operator. Similarly, to run AddModule in a non-delegated manner you will need add op and so on. As you might have already realized, for the delegated operators, which will be executed by the Arm backend delegate, we do not need to include those operators in this list. This is only for *non-delegated* operators.

-```{tip}
-The `run.sh` script takes in `--portable_kernels` option, which provides a way to supply a comma seperated list of portable kernels to be included.
-```
-
### Building the executor_runner Bare-Metal Application

The SDK dir is the same one prepared [earlier](#setup-the-arm-ethos-u-software-development). And, we will be passing the `.pte` file (any one of them) generated above. Note, you have to generate a new `executor-runner` binary if you want to change the model or the `.pte` file. This constraint is from the constrained bare-metal runtime environment we have for Corstone-300/Corstone-320 platforms.

-This is performed by the `build_executorch_runner` function in `run.sh`.
+This is performed by the `backends/arm/scripts/build_executorch_runner.sh` script, run from `run.sh`.
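The helper scripts can also be invoked directly, outside `run.sh`. The flags below mirror how `run.sh` calls them later in this patch; the paths and values are purely illustrative, and the remaining defaults live in the scripts themselves:

```bash
# Illustrative sketch only: flag values follow the run.sh changes in this patch.
backends/arm/scripts/build_executorch.sh --et_build_root=arm_test --build_type=Release
backends/arm/scripts/build_portable_kernels.sh --et_build_root=arm_test --build_type=Release \
    --portable_kernels="aten::_softmax.out,aten::add.out"
backends/arm/scripts/build_executorch_runner.sh --pte=arm_test/softmax/softmax_arm_ethos-u55-128.pte \
    --build_type=Release --target=ethos-u55-128 --system_config=Ethos_U55_High_End_Embedded \
    --ethosu_tools_dir=examples/arm/ethos-u-scratch --output=arm_test/softmax
```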
```{tip}
The `run.sh` script takes in `--target` option, which provides a way to provide a specific target, Corstone-300(ethos-u55-128) or Corstone-320(ethos-u85-128)
@@ -310,7 +316,10 @@ The `run.sh` script takes in `--target` option, which provides a way to provide

## Running on Corstone FVP Platforms

-Once the elf is prepared, regardless of the `.pte` file variant is used to generate the bare metal elf. The below command is used to run the [MV2Model](#mv2module) on Corstone-320 FVP
+Once the elf is prepared, regardless of which `.pte` file variant was used to generate the bare-metal elf, `run.sh` will run the FVP for you via the `backends/arm/scripts/run_fvp.sh` script, but you can also run it directly.
+
+
+The below command is used to run the [MV2Model](#mv2module) on Corstone-320 FVP

```bash
ethos_u_build_dir=examples/arm/executor_runner/

diff --git a/examples/apple/coreml/llama/export.py b/examples/apple/coreml/llama/export.py
new file mode 100644
index 0000000000..58bc0859c7
--- /dev/null
+++ b/examples/apple/coreml/llama/export.py
@@ -0,0 +1,285 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+import argparse
+import json
+
+import sys
+
+import coremltools as ct
+import torch
+from executorch.backends.apple.coreml.compiler import CoreMLBackend # pyre-ignore
+from executorch.backends.apple.coreml.partition import CoreMLPartitioner # pyre-ignore
+from executorch.examples.models.llama.source_transformation.quantize import (
+    EmbeddingQuantHandler,
+)
+
+from executorch.exir.backend.utils import format_delegated_graph
+from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig
+from executorch.exir.passes import MemoryPlanningPass
+from executorch.exir.passes.quant_fusion_pass import QuantFusionPass
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
+from executorch.extension.export_util.utils import export_to_edge, save_pte_program
+
+sys.path.insert(0, ".")
+from llama_transformer import InputManager, ModelArgs, Transformer
+
+
+class SplitLinearModule(torch.nn.Module):
+    def __init__(self, in_features, out_features, target_split_size, max_splits):
+        super(SplitLinearModule, self).__init__()
+        num_splits = max(out_features // target_split_size, 1)
+        if num_splits > max_splits:
+            num_splits = max_splits
+
+        self.split_size = out_features // num_splits
+        self.split_remainder = out_features % num_splits
+        self.splits = torch.nn.ModuleList(
+            [torch.nn.Linear(in_features, self.split_size) for _ in range(num_splits)]
+        )
+        print(
+            f"Splitting out_features={out_features} into {num_splits} of size {self.split_size}"
+        )
+        if self.split_remainder > 0:
+            print(
+                f"Warning: remainder {self.split_remainder} after splitting out_features={out_features} into {num_splits} of size {self.split_size}"
+            )
+            self.splits.append(torch.nn.Linear(in_features, self.split_remainder))
+
+    def split_sizes(self):
+        return [split.out_features for split in self.splits]
+
+    def forward(self, x):
+        return torch.cat([split(x) for split in self.splits], dim=-1)
+
+
+def replace_linear_with_split_linear(model, target_split_size, max_splits):
+    for name, module in model.named_children():
+        if isinstance(module, torch.nn.Linear):
+            new_module = SplitLinearModule(
+                module.in_features, module.out_features, target_split_size, max_splits
+            )
+            split_sizes = new_module.split_sizes()
+            if module.bias is not
None: + split_bias = module.bias.split(split_sizes) + split_weights = module.weight.split(split_sizes, dim=0) + for i, split in enumerate(new_module.splits): + split.weight = torch.nn.Parameter(split_weights[i]) + if module.bias is not None: + split.bias = torch.nn.Parameter(split_bias[i]) + else: + split.bias = None + setattr(model, name, new_module) + else: + replace_linear_with_split_linear(module, target_split_size, max_splits) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "-n", + "--output_name", + default="model.pte", + help="Override the output filename of the saved pte model file.", + ) + parser.add_argument( + "-p", + "--params", + help="config.json", + ) + parser.add_argument( + "-c", + "--checkpoint", + help="checkpoint path", + ) + parser.add_argument( + "--seq_length", + type=int, + default=1, + help="length sequence to evaluate", + ) + parser.add_argument( + "--max_seq_length", + type=int, + default=128, + help="maximum length sequence to evaluate", + ) + parser.add_argument( + "--cache_size", + type=int, + default=None, + help="Cache size. Old items are evicted from cache", + ) + parser.add_argument( + "-E", + "--embedding-quantize", + default=None, + type=str, + help="type of embedding quantization, ',', e.g., '8,1024'.", + ) + parser.add_argument( + "--coreml-quantize", + default=None, + choices=["b4w", "c4w"], + help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight), c4w (for channelwise 4 bit weight)", + ) + parser.add_argument( + "--use_cache_list", + action="store_true", + help="Use cache list to speed up model computation (does not work in pybindings)", + ) + parser.add_argument( + "--target_split_size", + type=int, + default=None, + help="Split linear layers into smaller chunks of target_split_size.", + ) + parser.add_argument( + "--max_splits", + type=int, + default=8, + help="Maximum number of splits to divide linear layers", + ) + + export_args = parser.parse_args() + params_path = export_args.params + checkpoint_path = export_args.checkpoint + + # Load model args + with open(params_path, "r") as f: + params = json.loads(f.read()) + + args = ModelArgs( + max_seq_len=export_args.max_seq_length, + generate_full_logits=False, + use_cache_list=export_args.use_cache_list, + **params, + ) + + with torch.device("meta"): + model = Transformer(args) + + checkpoint = torch.load( + checkpoint_path, map_location="cpu", mmap=True, weights_only=True + ) + if "model" in checkpoint: + checkpoint = checkpoint["model"] + + missing, unexpected = model.load_state_dict( + checkpoint, + strict=False, + assign=True, + ) + print("Missing keys: ", missing) + print("Unexpected keys: ", unexpected) + + float_dtype = torch.float16 # dtype for model/inputs + model.eval() + model.to(float_dtype) + + if export_args.embedding_quantize: + bitwidth, group_size = export_args.embedding_quantize.split(",") + if group_size == "none" or group_size == "None" or group_size == "0": + group_size = None + else: + group_size = int(group_size) + bitwidth = int(bitwidth) + model = EmbeddingQuantHandler( + model, + bitwidth=bitwidth, + group_size=group_size, + packed=(bitwidth in [2, 4]), + ).quantized_model() + + if export_args.target_split_size is not None: + replace_linear_with_split_linear( + model, export_args.target_split_size, export_args.max_splits + ) + + model = model.to(float_dtype) + + op_linear_quantizer_config = None + if export_args.coreml_quantize == "b4w": + op_linear_quantizer_config = { + "mode": 
"linear_symmetric", + "dtype": "int4", + "granularity": "per_block", + "block_size": 32, + "weight_threshold": 512, + } + elif export_args.coreml_quantize == "c4w": + op_linear_quantizer_config = { + "mode": "linear_symmetric", + "dtype": "int4", + "granularity": "per_channel", + } + + compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16] + minimum_deployment_target=ct.target.iOS18, + compute_precision=ct.precision(ct.precision.FLOAT16.value), + compute_unit=ct.ComputeUnit.CPU_AND_NE, + model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16] + op_linear_quantizer_config=op_linear_quantizer_config, + ) + partitioner = CoreMLPartitioner( # pyre-fixme[16] + compile_specs=compile_specs, + take_over_mutable_buffer=False, + skip_ops_for_coreml_delegation=[ + "quantized_decomposed.embedding_4bit.dtype", + "aten.embedding.default", + ], + ) + + input_manager = InputManager( + n_layers=args.n_layers, + max_batch_size=args.max_batch_size, + n_kv_heads=args.n_kv_heads, + max_seq_length=args.max_seq_len, + head_dim=args.head_dim, + use_cache_list=export_args.use_cache_list, + seq_length=export_args.seq_length, + dtype=float_dtype, + minus_infinity=-30000, + cache_size=export_args.cache_size, + ) + example_inputs = input_manager.get_inputs(tokens=[0]) + + edge_manager = export_to_edge( + model, + example_inputs, + edge_compile_config=EdgeCompileConfig( + _check_ir_validity=False, + _skip_type_promotion=(float_dtype == torch.float16), + _skip_dim_order=True, + ), + ) + print("Edge program") + print(edge_manager.exported_program()) + + for node in edge_manager.exported_program().graph_module.graph.nodes: + print(node.name, node.target, node.args, node.kwargs) + + edge_manager = edge_manager.to_backend(partitioner) + + print("Delegated program") + + print(format_delegated_graph(edge_manager.exported_program().graph_module)) + + executorch_program = edge_manager.to_executorch( + ExecutorchBackendConfig( + extract_delegate_segments=True, + passes=[ + QuantFusionPass(), + ], + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), + sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), + ) + ) + + filename = save_pte_program(executorch_program, export_args.output_name) + print(f"Saved Executorch program to local {filename}") + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/apple/coreml/llama/llama_transformer.py b/examples/apple/coreml/llama/llama_transformer.py new file mode 100644 index 0000000000..5788bcd5e5 --- /dev/null +++ b/examples/apple/coreml/llama/llama_transformer.py @@ -0,0 +1,570 @@ +# @lint-ignore-every LICENSELINT +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Copyright (c) Meta Platforms, Inc. All Rights Reserved. + +# Please refer to README.md in the same folder for more information. 
+ +from dataclasses import dataclass +from functools import partial +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn.functional as F + +from executorch.examples.models.llama.llama_transformer import RMSNorm + +from executorch.examples.models.llama.rope import ( + hf_apply_rotary_emb, + hf_precompute_freqs_cis, + precompute_freqs_cis, + RotaryEmbedding, +) + +from torch import nn + + +# These are just to prevent to_edge from decomposing SDPA +# A better method is to use the to_edge_transform_and_lower API for CoreML +# and not decompose SDPA +@torch.library.custom_op("coreml::sdpa", mutates_args=()) +def sdpa( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor +) -> torch.Tensor: + """Same as F.scaled_dot_product_attention, but with custom op to avoid lowering during dialect conversion.""" + return torch.ops.aten.scaled_dot_product_attention.default( + q, k, v, attn_mask=attn_mask + ) + + +@torch.library.register_fake("coreml::sdpa") +def _( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor +) -> torch.Tensor: + """Fake implementation with the right output shape, which is required for torch.compile/export/fx tracing.""" + expected_shape = list(q.shape) + expected_shape[-1] = v.shape[-1] + return q.new_empty(expected_shape) + + +def find_multiple(n: int, k: int) -> int: + if n % k == 0: + return n + return n + k - (n % k) + + +@dataclass +class ModelArgs: + dim: int = 2048 + n_layers: int = 16 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = 128256 + hidden_dim: Optional[int] = None + head_dim: Optional[int] = None # Optional customized head_dim + multiple_of: int = 256 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + max_batch_size: int = 1 + max_seq_len: int = 128 + max_context_len: int = 2048 + moe: bool = False # True to enable the MoE (Mixture of Experts) + num_experts: int = 8 # Number of experts + num_activated_experts: int = 2 # Number of experts to activate + + # Generate logits for all inputs. When it's True, it would take big memory usage + # at runtime. Enable it only necessary (e.g., use perplexity tools that requires + # logits for all input tokens.) + generate_full_logits: bool = False + # A dictionary mapping from pruned token-id to original token-id + input_prune_map: Optional[Dict[int, int]] = None + # A dictionary mapping from pruned token-id to original token-id + output_prune_map: Optional[Dict[int, int]] = None + use_hf_rope: bool = False # Use HuggingFace's RoPE implementation + rope_theta: Optional[float] = ( + None # The official name to override self.rope_freq_base. + ) + rope_freq_base: float = 10000.0 # The base frequency for RoPE. Keep it for BC. + use_scaled_rope: bool = True # Use scaled RoPE, introduced in llama3.1. + # Additional Model Metadata needed at runtime + rope_scale_factor: int = 8 + bos_idx: int = 1 + eos_idx: int = 3 + bos_count: int = -1 # i.e., a single EOS is used as BOS + eos_count: int = 2 + + quantization_args: Optional[dict] = None + lora_args: Optional[dict] = None + + use_cache_list: bool = True + + def __post_init__(self): + if self.n_kv_heads is None: + self.n_kv_heads = self.n_heads + + # rope_theta overrides rope_freq_base since it's the official name. 
+ if self.rope_theta is not None: + self.rope_freq_base = self.rope_theta + + if self.hidden_dim is None: + # If hidden_dim is not explicitly set in the ModelArgs, + # then calculate implicitly based on dim and also multiple of `args.multiple_of` + multiple_of = self.multiple_of + hidden_dim = 4 * self.dim + hidden_dim = int(2 * hidden_dim / 3) + if self.ffn_dim_multiplier is not None: + hidden_dim = int(self.ffn_dim_multiplier * hidden_dim) + self.hidden_dim = find_multiple(hidden_dim, multiple_of) + + if self.head_dim is None: + self.head_dim = self.dim // self.n_heads + + +class Rope(torch.nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + if self.params.use_hf_rope: + self.precompute_freqs_cis = hf_precompute_freqs_cis + else: + self.precompute_freqs_cis = partial( + precompute_freqs_cis, use_scaled=self.params.use_scaled_rope + ) + freqs_cos, freqs_sin = self.precompute_freqs_cis( + self.params.head_dim, + ( + self.params.max_context_len # Normal llama2. + if self.params.ffn_dim_multiplier is None + else self.params.max_context_len * 2 # Sharded checkpoint. + ), + self.params.rope_freq_base, + scale_factor=8, + ) + self.register_buffer("freqs_cos", freqs_cos, persistent=False) + self.register_buffer("freqs_sin", freqs_sin, persistent=False) + if self.params.use_hf_rope: + self.apply_rotary_emb = hf_apply_rotary_emb + else: + self.apply_rotary_emb = RotaryEmbedding() + + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor, + ): + return self.apply_rotary_emb(q, k, freqs_cos, freqs_sin) + + def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int): + """ + Get the precomputed frequencies for the given input position and sequence length. + + Args: + input_pos (torch.Tensor): The input position tensor. + seq_len (int): The sequence length. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The precomputed frequencies for the given input position and sequence length. + """ + assert ( + input_pos is not None + ), "input_pos must be provided when use_kv_cache is True" + input_pos_item = input_pos[-1].item() + + # CoreML partitioner is not picking up _check_is_size + # So instead use _check as workaround. 
Should be easy fix for partitioner + # torch._check_is_size(input_pos_item) + torch._check(input_pos_item >= 0) + torch._check(input_pos_item + seq_len <= self.params.max_seq_len) + # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor + freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seq_len) + # pyre-ignore: Incompatible parameter type [6] + freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seq_len) + + return freqs_cos, freqs_sin + + +class FeedForward(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + assert args.hidden_dim is not None + hidden_dim: int = args.hidden_dim + self.w1 = nn.Linear(args.dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, args.dim, bias=False) + self.w3 = nn.Linear(args.dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +class ConditionalFeedForward(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + self.dim = args.dim + hidden_dim = args.hidden_dim + if hidden_dim is None: + # If hidden_dim is not explicitly set in the ModelArgs, + # then calculate implicitly based on dim and also multiple of `args.multiple_of` + multiple_of = args.multiple_of + hidden_dim = 4 * self.dim + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) + self.w2 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) + self.w3 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) + self.num_experts = args.num_experts + + def forward(self, x: torch.Tensor, expert_indices: torch.Tensor) -> torch.Tensor: + w1_weights = self.w1[expert_indices].transpose(-1, -2) # [T, A, D, D] + w3_weights = self.w3[expert_indices].transpose(-1, -2) # [T, A, D, D] + w2_weights = self.w2[expert_indices] # [T, A, D, D] + x1 = F.silu(torch.einsum("ti,taio -> tao", x, w1_weights)) + x3 = torch.einsum("ti, taio -> tao", x, w3_weights) + expert_outs = torch.einsum("tao, taoi -> tai", (x1 * x3), w2_weights) + return expert_outs + + +class MOEFeedForward(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.gate = nn.Linear(config.dim, config.num_experts, bias=False) + self.cond_ffn = ConditionalFeedForward(config) + self.dim = config.dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.view(-1, self.dim) + # T = num_tokens, E = num_experts, D = hidden dim, A = activated experts + # x: [T, D] + scores = self.gate(x) # [T, E] + expert_weights, expert_indices = torch.topk(scores, 2, dim=-1) # [T, A], [T, A] + expert_weights = expert_weights.softmax(dim=-1) # [T, A] + expert_outs = self.cond_ffn(x, expert_indices) + return torch.einsum("tai,ta -> ti", expert_outs, expert_weights) + + +class Attention(nn.Module): + def __init__(self, args: ModelArgs, layer_id: int, rope: Rope): + super().__init__() + self.n_heads = args.n_heads + self.n_kv_heads = self.n_heads if args.n_kv_heads is None else args.n_kv_heads + + assert self.n_heads % self.n_kv_heads == 0 + model_parallel_size = 1 + self.n_local_heads = self.n_heads // model_parallel_size + self.n_local_kv_heads = self.n_kv_heads // model_parallel_size + self.n_rep = self.n_local_heads // self.n_local_kv_heads + self.head_dim = args.head_dim + self.max_batch_size = args.max_batch_size + self.max_seq_len = args.max_seq_len + self.dim = args.dim + self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False) 
+ self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False) + + self.layer_id = layer_id + + self.rope = rope + + def forward( + self, + x: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + attn_mask: torch.Tensor, + ): + bsz, seqlen, _ = x.shape + # QKV + q, k, v = self.wq(x), self.wk(x), self.wv(x) + # We need view_copy elimination + q = q.view(bsz, seqlen, self.n_local_heads, self.head_dim) + k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + + # RoPE relative positional embeddings + q, k = self.rope.forward(q, k, freqs_cos, freqs_sin) + + q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + new_k = k + new_v = v + + k = torch.concat([k_cache, k], dim=2) + v = torch.concat([v_cache, v], dim=2) + + # grouped multiquery attention: expand out keys and values + if self.n_rep > 1: + k = k.repeat_interleave(self.n_rep, dim=1) + v = v.repeat_interleave(self.n_rep, dim=1) + + output = torch.ops.coreml.sdpa(q, k, v, attn_mask) + + output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) + + output = self.wo(output) + + return output, new_k, new_v + + +class TransformerBlock(nn.Module): + def __init__(self, layer_id: int, args: ModelArgs, rope: Rope): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.head_dim + self.attention = Attention(args, layer_id, rope) + if args.moe: + self.block_sparse_moe = MOEFeedForward(args) + else: + self.feed_forward = FeedForward(args) + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) + + def forward( + self, + x, + freqs_cos, + freqs_sin, + k_cache, + v_cache, + attn_mask, + ): # x: 1xN + norm_emb = self.attention_norm(x) + h, new_k, new_v = self.attention.forward( + norm_emb, freqs_cos, freqs_sin, k_cache, v_cache, attn_mask + ) + + h = x + h + out = h + self.feed_forward(self.ffn_norm(h)) + return out, new_k, new_v + + +class Transformer(nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) + self.rope = Rope(params) + self.layers = torch.nn.ModuleList() + for layer_id in range(params.n_layers): + self.layers.append(TransformerBlock(layer_id, params, self.rope)) + self.norm = RMSNorm(params.dim, eps=params.norm_eps) + self.output = nn.Linear(params.dim, params.vocab_size, bias=False) + self.generate_full_logits = params.generate_full_logits + self.max_seq_len = params.max_seq_len + self.input_prune_map = params.input_prune_map + self.output_prune_map = params.output_prune_map + self.use_cache_list = params.use_cache_list + + def forward( + self, + tokens: torch.LongTensor, # tokens + input_pos: torch.LongTensor, + input_length: torch.LongTensor, # input_length + k_caches: List[torch.FloatTensor], + v_caches: List[torch.FloatTensor], + attn_mask: torch.LongTensor, + h: Optional[torch.FloatTensor] = None, # embeddings + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if (tokens is None) ^ (h is not None): + raise ValueError( + "You cannot specify both tokens and h at the same time, and 
must specify either one" + ) + if tokens is not None and h is None: + h = self.tok_embeddings(tokens) + seqlen = h.shape[1] + freqs_cos, freqs_sin = self.rope.get_freqs(input_pos, seqlen) + + k_out = [] + v_out = [] + for i, layer in enumerate(self.layers): + h, new_k, new_v = layer( + h, + freqs_cos, + freqs_sin, + k_caches[i] if self.use_cache_list else k_caches[i, :, :, :, :], + v_caches[i] if self.use_cache_list else v_caches[i, :, :, :, :], + attn_mask, + ) + k_out.append(new_k) + v_out.append(new_v) + + if not self.generate_full_logits: + # Only the last logit is used for the new generated token + h = h[:, input_length - 1, :].squeeze(1) + + h = self.norm(h) + + logits = self.output(h) + + if not self.use_cache_list: + k_out = torch.stack(k_out, dim=0) + v_out = torch.stack(v_out, dim=0) + return logits, k_out, v_out + + +class InputManager: + def __init__( + self, + n_layers: int, + max_batch_size: int, + n_kv_heads: int, + max_seq_length: int, + head_dim: int, + use_cache_list: bool, + seq_length: int, + dtype=torch.float16, + minus_infinity=-torch.inf, + cache_size=None, + ): + if cache_size is None: + cache_size = max_seq_length - seq_length + self.cache_size = cache_size + assert self.cache_size + seq_length <= max_seq_length + + self.n_layers = n_layers + self.max_batch_size = max_batch_size + self.n_kv_heads = n_kv_heads + self.head_dim = head_dim + + self.seq_length = seq_length + self.use_cache_list = use_cache_list + + if self.use_cache_list: + self.k_caches = [ + torch.zeros(self.get_cache_shape(self.cache_size)).to(dtype) + for _ in range(self.n_layers) + ] + self.v_caches = [ + torch.zeros(self.get_cache_shape(self.cache_size)).to(dtype) + for _ in range(self.n_layers) + ] + else: + self.k_caches = torch.zeros(self.get_cache_shape(self.cache_size)).to(dtype) + self.v_caches = torch.zeros(self.get_cache_shape(self.cache_size)).to(dtype) + + attn_cache = minus_infinity * torch.ones( + seq_length, self.cache_size + ) # attn for past tokens + attn_seq = torch.triu( + minus_infinity * torch.ones(self.seq_length, self.seq_length), diagonal=1 + ) # attn for current tokens + self.attn_mask = torch.concat([attn_cache, attn_seq], dim=-1).to(dtype) + assert self.attn_mask.shape == ( + self.seq_length, + self.cache_size + self.seq_length, + ) + + self.input_pos = 0 + self.cache_pos = 0 + + def get_cache_shape(self, length): + if self.use_cache_list: + return ( + self.max_batch_size, + self.n_kv_heads, + length, + self.head_dim, + ) + return ( + self.n_layers, + self.max_batch_size, + self.n_kv_heads, + length, + self.head_dim, + ) + + def _update_cache(self, start, length, new_k_caches, new_v_caches): + """ + Copies new cache data from start to start + length to cache + """ + assert self.cache_pos + length <= self.cache_size + assert start + length <= self.seq_length + + if self.use_cache_list: + for i in range(self.n_layers): + assert new_k_caches[i].shape == self.get_cache_shape(self.seq_length) + assert new_v_caches[i].shape == self.get_cache_shape(self.seq_length) + + self.k_caches[i][ + :, :, (self.cache_pos) : (self.cache_pos + length), : + ] = new_k_caches[i][:, :, start : (start + length), :] + self.v_caches[i][ + :, :, (self.cache_pos) : (self.cache_pos + length), : + ] = new_v_caches[i][:, :, start : (start + length), :] + else: + assert new_k_caches.shape == self.get_cache_shape(self.seq_length) + assert new_v_caches.shape == self.get_cache_shape(self.seq_length) + self.k_caches[:, :, :, (self.cache_pos) : (self.cache_pos + length), :] = ( + new_k_caches[:, :, :, start : 
(start + length), :] + ) + self.v_caches[:, :, :, (self.cache_pos) : (self.cache_pos + length), :] = ( + new_v_caches[:, :, :, start : (start + length), :] + ) + + self.cache_pos += length + if self.cache_pos == self.cache_size: + self.cache_pos = 0 + + def update(self, input_length, new_k_caches, new_v_caches): + # Copy as much new cache data into cache as possible without wrapping + amount_to_copy = min(input_length, self.cache_size - self.cache_pos) + self._update_cache(0, amount_to_copy, new_k_caches, new_v_caches) + if self.input_pos <= self.cache_size: + self.attn_mask[:, (self.input_pos) : (self.input_pos + amount_to_copy)] = ( + 0.0 + ) + + # Copy remainder (cache is now wrapped around and has more room) + # Attention mask needs no further updates. Attention is paid to the whole cache + remaining_to_copy = min( + input_length - amount_to_copy, self.cache_size - self.cache_pos + ) + if remaining_to_copy > 0: + self._update_cache( + amount_to_copy, remaining_to_copy, new_k_caches, new_v_caches + ) + + self.input_pos += input_length + + def get_inputs(self, tokens: List[int]): + input_length = len(tokens) + assert input_length <= self.seq_length + + return ( + # tokens + torch.concat( + [ + torch.tensor(tokens, dtype=torch.int64), + torch.zeros(self.seq_length - input_length, dtype=torch.int64), + ], + axis=-1, + ).reshape(1, -1), + # input_pos + torch.tensor([self.input_pos], dtype=torch.long), + # input_length + torch.tensor([input_length], dtype=torch.long), + # k_cache + self.k_caches, + # v_cache + self.v_caches, + # attn_mask + self.attn_mask, + ) + + def get_inputs_and_remaining_tokens(self, tokens: List[int]): + processed_tokens = min(self.seq_length, len(tokens)) + return ( + self.get_inputs(tokens[0:processed_tokens]), + tokens[processed_tokens:], + ) diff --git a/examples/apple/coreml/llama/readme.md b/examples/apple/coreml/llama/readme.md new file mode 100644 index 0000000000..353f0b5630 --- /dev/null +++ b/examples/apple/coreml/llama/readme.md @@ -0,0 +1,39 @@ +# ANE-friendly Llama models + +This directory contains ANE-friendly Llama models. + +Export model with: +``` +python export.py -n /path/to/output/model.pte -p /path/to/params.json -c /path/to/model.pth --seq_length 64 --max_seq_length 1024 --coreml-quantize c4w +``` + +(Note the script should be run from the executorch/examples/apple/coreml/llama directory.) + +The runner is written in python and is only intended to serve as an example for how the model inputs should be processed; it is not performant. + + +Run model with: +``` +python run.py -m /path/to/model.pte -t /path/to/tokenizer.model --prompt "Once upon a time," +``` + +(Note the script should be run from the executorch/examples/apple/coreml/llama directory.) + + +## Export args +* seq_length: the number of tokens processed by the model. Sequences shorter than seq_length must be padded, and sequences longer than it must be chunked. +* max_seq_length: the maximum context tokens that can be processed. +* cache_size: the size of the KV cache sequences. This parameter is optional, and defaults to max_seq_length - seq_length. If a smaller cache_size is used, older tokens are evicted from the cache and no longer play a role in attention. For example, if max_seq_length=1024, but cache_size is 512, the model can generate up to 1024 tokens, but only the current tokens and the previous 512 will participate in attention. In terms of computation, cache_size plays a similar role to max_seq_length in models without cache eviction. 
+* use_cache_list: boolean option that controls whether KV caches are passed as a list of 4D tensors, one per layer, or if they are passed as one 5D tensor. (Note that use_cache_list does not work with ExecuTorch pybindings.) +* target_split_size: this option splits linear layers into chunks of target size. For example, if target_split_size is 1024, a linear layer with (in_features=512, out_features=8096) will be split into 8 linear layers with (in_features=512, out_features=1024) and the results concatted. If not specified, the default is no splitting. +* max_splits: this controls the maximum number of splits for linear layers. It is only relevant if target_size is passed and defaults to 8. + +## Llama1B on iPhone 15 + +We are actively experimenting with different settings. But here are ones that we've found work well for Llama1B on iPhone 15 Pro: + +* Set use_cache_list +* Split linear layers with target_split_size=1024, max_splits=8 +* Use seq_length=32 or seq_length=64, both of which offer reasonable tradeoffs for prefill and decode performance. seq_length=32 is better at decode and seq_length=64 is better at prefill. + +In our tests, we set max_seq_length=1024, but if your application allows for it, performance can improve with max_seq_length=512 or by keeping max_seq_length=1024 and setting cache_size=512-seq_length. diff --git a/examples/apple/coreml/llama/run.py b/examples/apple/coreml/llama/run.py new file mode 100644 index 0000000000..65026e1f6b --- /dev/null +++ b/examples/apple/coreml/llama/run.py @@ -0,0 +1,134 @@ +import argparse +import sys + +import sentencepiece as spm + +import torch + +from executorch.runtime import Runtime + + +sys.path.insert(0, ".") +from executorch.examples.models.llama.runner.generation import next_token +from executorch.examples.models.llama.tokenizer import tiktoken +from llama_transformer import InputManager + + +class Tokenizer: + def __init__(self, model_path: str): + # Try sentence piece + try: + print("Trying to load sentencepiece") + sp = spm.SentencePieceProcessor() + sp.load(model_path) + self.tokenizer = sp + except: + print("Trying to load tiktoken") + self.tokenizer = tiktoken.Tokenizer(model_path) + + def encode(self, text, bos, eos): + if isinstance(self.tokenizer, spm.SentencePieceProcessor): + bos_string = "" if bos else "" + eos_string = "" if eos else "" + return self.tokenizer.encode(f"{bos_string}{text}{eos_string}") + return self.tokenizer.encode(text, bos=bos, eos=eos) + + def decode_token(self, token): + if isinstance(self.tokenizer, spm.SentencePieceProcessor): + return f"{self.tokenizer.decode(token)} " + return self.tokenizer.decode_token(token) + + def stop_tokens(self): + if isinstance(self.tokenizer, spm.SentencePieceProcessor): + return [self.tokenizer.eos_id()] + return self.tokenizer.stop_tokens + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", + "--model", + help="model.pte", + ) + parser.add_argument( + "-t", + "--tokenizer", + help="tokenizer.model path", + ) + parser.add_argument( + "--prompt", + type=str, + default="Once upon a time,", + ) + parser.add_argument( + "--temperature", + type=float, + default=0.6, + ) + parser.add_argument( + "--top_p", + type=float, + default=0.9, + ) + + args = parser.parse_args() + + tokenizer = Tokenizer(args.tokenizer) + + runtime = Runtime.get() + program = runtime.load_program(args.model) + method = program.load_method("forward") + + metadata = method.metadata + print("Method metadata: ", metadata, "\n\n") + + assert ( + 
metadata.num_inputs() == 6 + ), "Do not export with --use_cache_list for use in pybindings" + # k_cache input + n_layers, max_batch_size, n_kv_heads, cache_size, head_dim = ( + metadata.input_tensor_meta(3).sizes() + ) + + # mask input + seq_length, max_seq_length = metadata.input_tensor_meta(5).sizes() + + input_manager = InputManager( + n_layers=n_layers, + max_batch_size=max_batch_size, + n_kv_heads=n_kv_heads, + max_seq_length=max_seq_length, + head_dim=head_dim, + use_cache_list=False, + seq_length=seq_length, + dtype=torch.float16, + minus_infinity=-30000.0, + cache_size=cache_size, + ) + + print(args.prompt, end="") + tokens = tokenizer.encode(args.prompt, bos=True, eos=False) + while input_manager.input_pos + seq_length < max_seq_length: + while len(tokens) > 0 and ( + input_manager.input_pos + seq_length < max_seq_length + ): + inputs, remaining_tokens = input_manager.get_inputs_and_remaining_tokens( + tokens + ) + processed_tokens = len(tokens) - len(remaining_tokens) + logits, k, v = method.execute(inputs) + input_manager.update( + input_length=processed_tokens, new_k_caches=k, new_v_caches=v + ) + tokens = remaining_tokens + + tokens = [next_token(logits, args.temperature, args.top_p)] + + if tokens[-1] in tokenizer.stop_tokens(): + break + print(tokenizer.decode_token(tokens[-1]), end="", flush=True) + + +if __name__ == "__main__": + main() diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 33d8bc5ebf..f7f2105b99 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -185,7 +185,7 @@ def forward(self, x): return z example_input = (torch.ones(2, 2),) - can_delegate = False + can_delegate = True class MultipleOutputsModule(torch.nn.Module): @@ -484,15 +484,15 @@ def get_args(): # noqa C901 ): raise RuntimeError(f"Model {args.model_name} cannot be delegated.") - if args.system_config is None: + if "ethos-u" in args.target and args.system_config is None: if "u55" in args.target: args.system_config = "Ethos_U55_High_End_Embedded" elif "u85" in args.target: - args.system_confg = "Ethos_U85_SYS_DRAM_Mid" + args.system_config = "Ethos_U85_SYS_DRAM_Mid" else: raise RuntimeError(f"Invalid target name {args.target}") - if args.memory_mode is None: + if "ethos-u" in args.target and args.memory_mode is None: if "u55" in args.target: args.memory_mode = "Shared_Sram" elif "u85" in args.target: @@ -591,6 +591,7 @@ def get_args(): # noqa C901 output_name = os.path.join(args.output, output_name) save_pte_program(exec_prog, output_name) + print(f"PTE file saved as {output_name}.pte") if args.evaluate: evaluate_model( diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 1a50f59d45..ce92312b65 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -9,15 +9,13 @@ set -eu - - ######## ### Hardcoded constants ######## script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../.. 
&& pwd) +et_root_dir=$(realpath ${et_root_dir}) -# Default Ethos-u tool folder override with --scratch-dir= -root_dir=${script_dir}/ethos-u-scratch model_name="" aot_arm_compiler_flags="--delegate --quantize" @@ -31,23 +29,26 @@ extra_build_flags="" build_only=false system_config="" memory_mode="" +et_build_root="${et_root_dir}/arm_test" +ethos_u_scratch_dir=${script_dir}/ethos-u-scratch -help() { +function help() { echo "Usage: $(basename $0) [options]" echo "Options:" echo " --model_name= Model to run, can be a builtin, examples/models or a filename Default to all builtin models" echo " --aot_arm_compiler_flags= Only used if --model_name is used Default: ${aot_arm_compiler_flags}" echo " --portable_kernels= Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}" echo " --target= Target to build and run for Default: ${target}" - echo " --output= Output folder Default: ${output_folder}" + echo " --output= Target build output folder Default: ${output_folder}" echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" - echo " --debug_build Build with debug flag, default is Release" - echo " --extra_build_flags Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + echo " --extra_build_flags= Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " echo " --build_only Only build, don't run FVP" - echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default" echo " --system_config= System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets." echo " NOTE: If given, this option must match the given target. This option also sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt." echo " --memory_mode= Memory mode to select from the Vela configuration file (see vela.ini), e.g. Shared_Sram/Sram_Only. 
Default: 'Shared_Sram' for Ethos-U55 targets, 'Sram_Only' for Ethos-U85 targets" + echo " --et_build_root= Executorch build output root folder to use, defaults to ${et_build_root}" + echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default ${ethos_u_scratch_dir}" exit 0 } @@ -60,43 +61,26 @@ for arg in "$@"; do --target=*) target="${arg#*=}";; --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;; --etdump) build_with_etdump=true ;; - --debug_build) build_type="Debug" ;; + --build_type=*) build_type="${arg#*=}";; --extra_build_flags=*) extra_build_flags="${arg#*=}";; --build_only) build_only=true ;; - --scratch-dir=*) root_dir="${arg#*=}";; --system_config=*) system_config="${arg#*=}";; --memory_mode=*) memory_mode="${arg#*=}";; + --et_build_root=*) et_build_root="${arg#*=}";; + --scratch-dir=*) ethos_u_scratch_dir="${arg#*=}";; *) ;; esac done -root_dir=$(realpath ${root_dir}) -output_folder=$(realpath ${output_folder}) -mkdir -p ${output_folder} -if [ "$output_folder_set" = true ] ; then - executor_runner_path=${output_folder} -else - executor_runner_path=${script_dir}/executor_runner -fi -executor_runner_path=$(realpath ${executor_runner_path}) - -mkdir -p ${root_dir}/ethos-u -ethos_u_root_dir="$(cd ${root_dir}/ethos-u && pwd)" -setup_path_script=${root_dir}/setup_path.sh +# Default Ethos-u tool folder override with --scratch-dir= +ethos_u_scratch_dir=$(realpath ${ethos_u_scratch_dir}) +setup_path_script=${ethos_u_scratch_dir}/setup_path.sh +toolchain_cmake=${script_dir}/ethos-u-setup/arm-none-eabi-gcc.cmake +_setup_msg="please refer to ${script_dir}/setup.sh to properly install necessary tools." -# Executorch -et_root_dir=$(cd ${script_dir}/../.. && pwd) -et_build_dir=${et_root_dir}/cmake-out # Set target based variables -fvp_model=FVP_Corstone_SSE-300_Ethos-U55 -if [[ ${target} =~ "ethos-u85" ]] -then - echo "target is ethos-u85 variant so switching to CS320 FVP" - fvp_model=FVP_Corstone_SSE-320 -fi - if [[ ${system_config} == "" ]] then system_config="Ethos_U55_High_End_Embedded" @@ -115,227 +99,6 @@ then fi fi -toolchain_cmake=${script_dir}/ethos-u-setup/arm-none-eabi-gcc.cmake -_setup_msg="please refer to ${script_dir}/ethos-u-setup/setup.sh to properly install necessary tools." - -if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]]; then - echo " ERROR: specified argument --portable_kernels=${portable_kernels}" - echo " is in the wrong format please use \"aten::.out,aten::.out,...\"" - echo " e.g. \"aten::_softmax.out,aten::add.out\"" - exit 1 -fi - -# Generate a pte file -# output from this function is the pte filename e.g. echo should be avoided or directed to stderr e.g. 
>&2 -function generate_pte_file() { - [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and model_compiler_flags flag, got, $*"; exit 1; } - local model=${1} - local model_short_name=$(basename -- "${model}" ".py") - local model_compiler_flags=${2} - - local model_filename=${model_short_name}_arm_${target}.pte - if [[ "${model_compiler_flags}" == *"--delegate"* ]]; then - # Name aligned with default aot_arm_compiler output - model_filename=${model_short_name}_arm_delegate_${target}.pte - fi - cd $et_root_dir - - local pte_file - pte_file=$(realpath ${output_folder}/${model_filename}) - rm -f "${pte_file}" - - SO_EXT=$(python3 -c 'import platform; print({"Darwin": "dylib", "Linux": "so", "Windows": "dll"}.get(platform.system(), None))') - # We are using the aot_lib from build_quantization_aot_lib below - SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT}) - - local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --output ${output_folder} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}" - echo "CALL ${ARM_AOT_CMD}" >&2 - ${ARM_AOT_CMD} 1>&2 - - [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } - echo "${pte_file}" -} - -# build ExecuTorch Libraries -function build_executorch() { - set -x - - [[ -d "${et_build_dir}" ]] \ - && echo "[${FUNCNAME[0]}] Warn: using already existing build-dir for executorch: ${et_build_dir}!!" - mkdir -p "${et_build_dir}" - - cd "${et_root_dir}" - - build_with_etdump_flags="" - if [ "$build_with_etdump" = true ] ; then - ( set +x ; - echo "--------------------------------------------------------------------------------" ; - echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_root_dir} - cmake-out-host-tools/bin/flatcc" ; - echo "--------------------------------------------------------------------------------" ) - - - # Build host flatcc bin - mkdir -p cmake-out-host-tools - cmake \ - -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ - -DCMAKE_BUILD_TYPE=${build_type} \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ - -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=ON \ - -DFLATCC_ALLOW_WERROR=OFF \ - -DFLATC_EXECUTABLE="$(which flatc)" \ - ${extra_build_flags} \ - -Bcmake-out-host-tools \ - "${et_root_dir}" - - mkdir -p cmake-out-host-tools/bin - cp third-party/flatcc/bin/flatcc cmake-out-host-tools/bin - - build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \ - -DFLATCC_ALLOW_WERROR=OFF \ - -DFLATCC_EXECUTABLE=${et_root_dir}/cmake-out-host-tools/bin/flatcc " - fi - - ( set +x ; - echo "--------------------------------------------------------------------------------" ; - echo "Build ExecuTorch Libraries target libs with --target install ${build_type} into '${et_root_dir}' - '${et_build_dir}'" ; - echo "--------------------------------------------------------------------------------" ) - - # Build - cmake \ - -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ - -DCMAKE_BUILD_TYPE=${build_type} \ - -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ - 
-DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ - ${build_with_etdump_flags} \ - -DFLATC_EXECUTABLE="$(which flatc)" \ - ${extra_build_flags} \ - -B${et_build_dir} \ - "${et_root_dir}" - - echo "[${FUNCNAME[0]}] Configured CMAKE" - - cmake --build ${et_build_dir} --parallel --target install --config ${build_type} -- - - ( set +x ; - echo "--------------------------------------------------------------------------------" ; - echo "Build ExecuTorch Libraries ${build_type} into '${et_root_dir}/examples/arm' - '${et_build_dir}/examples/arm'" ; - echo "--------------------------------------------------------------------------------" ) - - cmake \ - -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ - -DCMAKE_BUILD_TYPE=${build_type} \ - -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ - -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels} \ - -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - ${extra_build_flags} \ - -B"${et_build_dir}/examples/arm" \ - "${et_root_dir}/examples/arm" - - cmake --build "${et_build_dir}/examples/arm" --parallel --config ${build_type} -- - - set +x - - cd "${et_build_dir}" - echo "[${FUNCNAME[0]}] Generated static libraries for ExecuTorch:" - find . -name "*.a" -exec ls -al {} \; -} - -# build Arm Baremetal executor_runner -function build_executorch_runner() { - echo "[${FUNCNAME[0]}] Generating ExecuTorch libraries" - [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expecting a single pte file as argument got, $*"; exit 1; } - local pte=${1} - if [[ ${target} == *"ethos-u55"* ]]; then - local target_cpu=cortex-m55 - else - local target_cpu=cortex-m85 - fi - echo "--------------------------------------------------------------------------------" - echo "Build Arm Baremetal executor_runner for ${target} - '${executor_runner_path}/cmake-out'" - echo "--------------------------------------------------------------------------------" - - cd ${script_dir}/executor_runner - - build_with_etdump_flags="" - if [ "$build_with_etdump" = true ] ; then - build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " - fi - - cmake \ - -DCMAKE_BUILD_TYPE=${build_type} \ - -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ - -DTARGET_CPU=${target_cpu} \ - -DET_DIR_PATH:PATH=${et_root_dir} \ - -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ - -DET_PTE_FILE_PATH:PATH="${pte}" \ - -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ - -DETHOSU_TARGET_NPU_CONFIG=${target} \ - ${build_with_etdump_flags} \ - -DPYTHON_EXECUTABLE=$(which python3) \ - -DSYSTEM_CONFIG=${system_config} \ - ${extra_build_flags} \ - -B ${executor_runner_path}/cmake-out - - echo "[${FUNCNAME[0]}] Configured CMAKE" - - cmake --build ${executor_runner_path}/cmake-out --parallel -- arm_executor_runner - echo "[${FUNCNAME[0]}] Generated baremetal elf file:" - find ${executor_runner_path}/cmake-out -name "arm_executor_runner" - echo "executable_text: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $1}') bytes" - echo "executable_data: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $2}') bytes" - echo "executable_bss: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $3}') bytes" -} - -# Execute the executor_runner on FVP Simulator -function run_fvp() { - [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expexted elf 
binary name, got $*"; exit 1; } - local elf_name=${1} - elf=$(find ${executor_runner_path} -name "${elf_name}") - [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner elf: ${elf}"; exit 1; } - num_macs=$(echo ${target} | cut -d - -f 3) - - if [[ ${target} == *"ethos-u55"* ]]; then - echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" - ${fvp_model} \ - -C ethosu.num_macs=${num_macs} \ - -C mps3_board.visualisation.disable-visualisation=1 \ - -C mps3_board.telnetterminal0.start_telnet=0 \ - -C mps3_board.uart0.out_file='-' \ - -C mps3_board.uart0.shutdown_on_eot=1 \ - -a "${elf}" \ - --timelimit 220 || true # seconds - echo "[${FUNCNAME[0]}] Simulation complete, $?" - elif [[ ${target} == *"ethos-u85"* ]]; then - echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" - ${fvp_model} \ - -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ - -C mps4_board.visualisation.disable-visualisation=1 \ - -C vis_hdlcd.disable_visualisation=1 \ - -C mps4_board.telnetterminal0.start_telnet=0 \ - -C mps4_board.uart0.out_file='-' \ - -C mps4_board.uart0.shutdown_on_eot=1 \ - -a "${elf}" \ - --timelimit 220 || true # seconds - echo "[${FUNCNAME[0]}] Simulation complete, $?" - else - echo "Running ${elf} for ${target} is not supported" - exit 1 - fi -} - ####### ### Main ####### @@ -343,12 +106,10 @@ function run_fvp() { # This should be prepared by the setup.sh [[ -f ${setup_path_script} ]] \ || { echo "Missing ${setup_path_script}. ${_setup_msg}"; exit 1; } -source ${root_dir}/setup_path.sh -# basic checks before we get started -hash ${fvp_model} \ - || { echo "Could not find ${fvp_model} on PATH, ${_setup_msg}"; exit 1; } +source ${setup_path_script} +# basic checks before we get started hash arm-none-eabi-gcc \ || { echo "Could not find arm baremetal toolchain on PATH, ${_setup_msg}"; exit 1; } @@ -358,9 +119,24 @@ hash arm-none-eabi-gcc \ [[ -f ${et_root_dir}/CMakeLists.txt ]] \ || { echo "Executorch repo doesn't contain CMakeLists.txt file at root level"; exit 1; } -# build executorch libraries -build_executorch -cd $et_root_dir && backends/arm/scripts/build_quantized_ops_aot_lib.sh $build_type +# Build executorch libraries +cd $et_root_dir +if [ "$build_with_etdump" = true ] ; then + et_dump_flag="--etdump" +else + et_dump_flag="" +fi + +backends/arm/scripts/build_executorch.sh --et_build_root="${et_build_root}" --build_type=$build_type $et_dump_flag +backends/arm/scripts/build_portable_kernels.sh --et_build_root="${et_build_root}" --build_type=$build_type --portable_kernels=$portable_kernels + +# Build a lib quantized_ops_aot_lib +backends/arm/scripts/build_quantized_ops_aot_lib.sh --et_build_root="${et_build_root}" --build_type=$build_type + +SO_EXT=$(python3 -c 'import platform; print({"Darwin": "dylib", "Linux": "so", "Windows": "dll"}.get(platform.system(), None))') +# We are using the aot_lib from build_quantization_aot_lib below +SO_LIB=$(find "${et_build_root}/cmake-out-aot-lib" -name libquantized_ops_aot_lib.${SO_EXT}) + if [[ -z "$model_name" ]]; then # the test models run, and whether to delegate @@ -373,19 +149,51 @@ fi # loop over running the AoT flow and executing the model on device for i in "${!test_model[@]}"; do + model="${test_model[i]}" + model_compiler_flags="${model_compiler_flags[i]}" + echo "--------------------------------------------------------------------------------" - printf "Running e2e flow for model '%s' with flags '%s'\n" "${test_model[i]}" "${model_compiler_flags[i]}" + printf 
"Running e2e flow for model '%s' with flags '%s'\n" "${model}" "${model_compiler_flags}" echo "--------------------------------------------------------------------------------" - pte=$(generate_pte_file "${test_model[i]}" "${model_compiler_flags[i]}") - stat --printf="Generated pte_data_size: %s bytes\npte_file:%n\n" ${pte} + + cd $et_root_dir + model_short_name=$(basename -- "${model}" ".py") + model_filename=${model_short_name}_arm_${target}.pte + + if [[ "${model_compiler_flags}" == *"--delegate"* ]]; then + # Name aligned with default aot_arm_compiler output + model_filename=${model_short_name}_arm_delegate_${target}.pte + fi + + if [ "$output_folder_set" = false ] ; then + output_folder=${et_build_root}/${model_short_name} + fi + + output_folder=$(realpath ${output_folder}) + mkdir -p ${output_folder} + pte_file=$(realpath -m ${output_folder}/${model_filename}) + + rm -f "${pte_file}" + + ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${output_folder} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}" + echo "CALL ${ARM_AOT_CMD}" >&2 + ${ARM_AOT_CMD} 1>&2 + + [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } + echo "pte_data_size: $(wc -c ${pte_file})" + echo "pte_file: ${pte_file}" + if [[ ${target} == *"TOSA"* ]]; then - echo "Build for ${target} skip generating .elf and running" + echo "Build for ${target} skip generating a .elf and running it" else + set -x # Rebuild the application as the pte is imported as a header/c array - build_executorch_runner "${pte}" + backends/arm/scripts/build_executorch_runner.sh "--pte=${pte_file}" --build_type=$build_type --target=$target --system_config=$system_config $et_dump_flag --extra_build_flags="$extra_build_flags" --ethosu_tools_dir="$ethos_u_scratch_dir" --output="${output_folder}" if [ "$build_only" = false ] ; then - run_fvp arm_executor_runner + # Execute the executor_runner on FVP Simulator + backends/arm/scripts/run_fvp.sh --elf=${output_folder}/cmake-out/arm_executor_runner --target=$target fi + set +x fi done diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 800dfb8d6d..8cfacbd374 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -65,8 +65,7 @@ tosa_reference_model_rev="70ed0b40fa831387e36abdb4f7fb9670a3464f5a" # vela vela_repo_url="https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela" -vela_rev="e131bf4f528f0d461868229972e07f371dcbc881" - +vela_rev="46d88f56902be0706e051c10153ffb7620e01ee3" ######## ### Optional user args diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS index ad97820524..489d42c29c 100644 --- a/examples/models/llama/TARGETS +++ b/examples/models/llama/TARGETS @@ -95,11 +95,11 @@ runtime.command_alias( ) runtime.python_library( - name = "export_library", + name = "source_transformation", + visibility = [ + "//executorch/examples/...", + ], srcs = [ - "export_llama.py", - "export_llama_lib.py", - "model.py", "source_transformation/apply_spin_quant_r1_r2.py", "source_transformation/attention.py", "source_transformation/lora.py", @@ -114,6 +114,15 @@ runtime.python_library( "source_transformation/vulkan_rope.py", "source_transformation/attention_sink.py", ], +) + +runtime.python_library( + name = "export_library", + srcs = [ + "export_llama.py", + "export_llama_lib.py", + "model.py", + ], _is_external_target = True, base_module = "executorch.examples.models.llama", 
visibility = [ @@ -123,6 +132,7 @@ runtime.python_library( "@EXECUTORCH_CLIENTS", ], deps = [ + ":source_transformation", "//ai_codesign/gen_ai/fast_hadamard_transform:fast_hadamard_transform", "//caffe2:torch", "//executorch/backends/vulkan/_passes:vulkan_passes", diff --git a/examples/models/llama/static_attention.py b/examples/models/llama/static_attention.py index 8b341a3aaf..3a5f88ad3f 100644 --- a/examples/models/llama/static_attention.py +++ b/examples/models/llama/static_attention.py @@ -47,19 +47,29 @@ def calculate_cache_key(layer_id: int, head_id: int) -> str: return f"l{layer_id},h{head_id}" @staticmethod - def apply_update(cache, update, transpose=False): + def apply_update(cache, update, pos, style, transpose=False): """ After inference, update the cache state for next iteration. The runtime needs to implement the same operation. """ - if transpose: - update_len = update.size(-1) - updated = torch.roll(cache, -update_len, -1) - updated[:, :, -update_len:] = update - else: - update_len = update.size(-2) - updated = torch.roll(cache, -update_len, -2) - updated[:, -update_len:, :] = update + if style == "shift_pointer": + if transpose: + update_len = update.size(-1) + updated = torch.roll(cache, -update_len, -1) + updated[:, :, -update_len:] = update + else: + update_len = update.size(-2) + updated = torch.roll(cache, -update_len, -2) + updated[:, -update_len:, :] = update + + if style == "smart_mask": + updated = torch.clone(cache) + if transpose: + update_len = update.size(-1) + updated[:, :, pos : pos + update_len] = update + else: + update_len = update.size(-2) + updated[:, pos : pos + update_len, :] = update return updated @@ -114,15 +124,67 @@ def update( return all_data, (out_k_cache, out_v_cache) -def _apply_rotary_embedding( - x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor -) -> torch.Tensor: - x_r, x_i = x[..., ::2], x[..., 1::2] - x_out_r = x_r * freqs_cos - x_i * freqs_sin - x_out_i = x_r * freqs_sin + x_i * freqs_cos +class StaticAttentionMask: + def __init__(self, input_len, cache_len, style): + self.input_len = input_len + self.cache_len = cache_len + assert style in ("shift_pointer", "smart_mask") + self.style = style + self.unmasked_len = 0 + self.tensor = torch.zeros(1, input_len, input_len + cache_len) + self.reset() + + def reset(self): + self.unmasked_len = 0 + self.tensor[:, :, : self.cache_len] = float("-inf") + + def unmask(self, new_unmasked_len): + if new_unmasked_len <= 0: + return + + if self.style == "shift_pointer": + self.tensor[ + :, + :, + self.cache_len + - self.unmasked_len + - new_unmasked_len : self.cache_len + - self.unmasked_len, + ] = 0 + + if self.style == "smart_mask": + self.tensor[ + :, + :, + self.unmasked_len : self.unmasked_len + new_unmasked_len, + ] = 0 + + self.unmasked_len += new_unmasked_len + + +class _Rope(nn.Module): + def __init__(self, use_hf_rope): + super().__init__() + self.use_hf_rope = use_hf_rope - x_out = torch.cat([x_out_r, x_out_i], dim=-1) - return x_out + def forward( + self, x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor + ) -> torch.Tensor: + if self.use_hf_rope: + if len(freqs_cos.shape) == 2: + freqs_cos = freqs_cos.unsqueeze(0) + if len(freqs_sin.shape) == 2: + freqs_sin = freqs_sin.unsqueeze(0) + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + x_rotated = torch.cat((-x2, x1), dim=-1) + return x * freqs_cos + x_rotated * freqs_sin + else: + x_r, x_i = x[..., ::2], x[..., 1::2] + x_out_r = x_r * freqs_cos - x_i * freqs_sin + x_out_i = x_r * freqs_sin 
+ x_i * freqs_cos + x_out = torch.cat([x_out_r, x_out_i], dim=-1) + return x_out @register_attention("static") @@ -172,6 +234,7 @@ def __init__(self, config: ModelArgs, layer_id: int, rope: Rope): [StaticVCache(layer_id, i) for i in range(self.n_kv_heads)] ) self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False) + self.rope = _Rope(rope.params.use_hf_rope) def forward( self, @@ -191,8 +254,8 @@ def forward( new_qs = [self.wqs[i](x) for i in range(self.n_heads)] new_ks = [self.wks[i](x) for i in range(self.n_kv_heads)] new_vs = [self.wvs[i](x) for i in range(self.n_kv_heads)] - new_qs = [_apply_rotary_embedding(q, freqs_cos, freqs_sin) for q in new_qs] - new_ks = [_apply_rotary_embedding(k, freqs_cos, freqs_sin) for k in new_ks] + new_qs = [self.rope(q, freqs_cos, freqs_sin) for q in new_qs] + new_ks = [self.rope(k, freqs_cos, freqs_sin) for k in new_ks] all_ks = [] all_vs = [] @@ -211,7 +274,7 @@ def forward( kv_idx = i // self.n_heads_per_kv_group attn = new_qs[i] @ all_ks[kv_idx].transpose(-2, -1) attn = attn * self.inv_scale - attn = attn + mask # pyre-ignore + attn = attn + mask attn = F.softmax(attn, dim=-1) heads.append(attn @ all_vs[kv_idx]) diff --git a/examples/models/llama/tests/test_static_attention.py b/examples/models/llama/tests/test_static_attention.py index 401ba604cd..45364b1d5e 100644 --- a/examples/models/llama/tests/test_static_attention.py +++ b/examples/models/llama/tests/test_static_attention.py @@ -7,6 +7,7 @@ from executorch.examples.models.llama.rope import Rope from executorch.examples.models.llama.static_attention import ( StaticAttention, + StaticAttentionMask, StaticKVCache, ) @@ -43,6 +44,35 @@ def test_without_cache(self): ) self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all()) + def test_hf_rope_without_cache(self): + config = ModelArgs( + dim=64, + n_heads=4, + n_kv_heads=2, + max_seq_len=8, + use_hf_rope=True, + ) + layer_id = 0 + rope = Rope(config) + attn_mha = AttentionMHA(config, layer_id, rope).eval() + static_attn = StaticAttention(config, layer_id, rope).eval() + static_attn.load_weights_from_attention_mha(attn_mha) + + x = torch.rand(1, config.max_seq_len, config.dim) + freqs_cos, freqs_sin = rope.get_freqs(None, config.max_seq_len) + expected, _ = attn_mha(x, freqs_cos, freqs_sin) + mask = torch.triu( + torch.full((1, config.max_seq_len, config.max_seq_len), float("-inf")), + diagonal=1, + ) + y, _ = static_attn( + x, + freqs_cos.unsqueeze(0), + freqs_sin.unsqueeze(0), + mask=mask, + ) + self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all()) + def test_with_cache(self): config = ModelArgs( dim=64, @@ -63,48 +93,54 @@ def test_with_cache(self): n_chunks = 3 chunk_len = config.max_seq_len // n_chunks cache_len = config.max_seq_len - chunk_len - mask = torch.zeros(1, chunk_len, cache_len + chunk_len) - mask[:, :, :cache_len] = float("-inf") - mask[:, :, cache_len:] = torch.triu( - torch.full((1, chunk_len, chunk_len), float("-inf")), - diagonal=1, - ) - k_caches = { - StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( - 1, cache_len, config.head_dim - ) - for i in range(config.n_kv_heads) - } - v_caches = { - StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( - 1, cache_len, config.head_dim - ) - for i in range(config.n_kv_heads) - } - ys = [] - for i in range(n_chunks): - y_i, attn_update = static_attn( - x[:, i * chunk_len : (i + 1) * chunk_len, :], - freqs_cos[i * chunk_len : (i + 1) * chunk_len], - freqs_sin[i * chunk_len : (i + 1) * chunk_len], - mask=mask, - in_cache_state=(k_caches, v_caches), 
- out_cache_state=({}, {}), + + def test_with_style(style): + mask = StaticAttentionMask(chunk_len, cache_len, style=style) + mask.tensor[:, :, cache_len:] = torch.triu( + torch.full((1, chunk_len, chunk_len), float("-inf")), + diagonal=1, ) - ys.append(y_i) - mask[:, :, cache_len - chunk_len * (i + 1) : cache_len] = 0 - k_cache_updates, v_cache_updates = attn_update["out_cache_state"] - for cache_id, update in k_cache_updates.items(): - k_caches[cache_id] = StaticKVCache.apply_update( - k_caches[cache_id], update + k_caches = { + StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( + 1, cache_len, config.head_dim ) - for cache_id, update in v_cache_updates.items(): - v_caches[cache_id] = StaticKVCache.apply_update( - v_caches[cache_id], update + for i in range(config.n_kv_heads) + } + v_caches = { + StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( + 1, cache_len, config.head_dim ) + for i in range(config.n_kv_heads) + } + ys = [] + for i in range(n_chunks): + y_i, attn_update = static_attn( + x[:, i * chunk_len : (i + 1) * chunk_len, :], + freqs_cos[i * chunk_len : (i + 1) * chunk_len], + freqs_sin[i * chunk_len : (i + 1) * chunk_len], + mask=mask.tensor, + in_cache_state=(k_caches, v_caches), + out_cache_state=({}, {}), + ) + ys.append(y_i) + mask.unmask(chunk_len) + k_cache_updates, v_cache_updates = attn_update["out_cache_state"] - y = torch.cat(ys, dim=1) - self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all()) + if i < n_chunks - 1: + for cache_id, update in k_cache_updates.items(): + k_caches[cache_id] = StaticKVCache.apply_update( + k_caches[cache_id], update, pos=chunk_len * i, style=style + ) + for cache_id, update in v_cache_updates.items(): + v_caches[cache_id] = StaticKVCache.apply_update( + v_caches[cache_id], update, pos=chunk_len * i, style=style + ) + + y = torch.cat(ys, dim=1) + self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all()) + + test_with_style("shift_pointer") + test_with_style("smart_mask") def test_within_transformer(self): config = ModelArgs( @@ -133,48 +169,57 @@ def test_within_transformer(self): n_chunks = 3 chunk_len = config.max_seq_len // n_chunks cache_len = config.max_seq_len - chunk_len - mask = torch.zeros(1, chunk_len, cache_len + chunk_len) - mask[:, :, :cache_len] = float("-inf") - mask[:, :, cache_len:] = torch.triu( - torch.full((1, chunk_len, chunk_len), float("-inf")), - diagonal=1, - ) - k_caches = { - StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( - 1, cache_len, config.head_dim - ) - for layer_id in range(config.n_layers) - for i in range(config.n_kv_heads) - } - v_caches = { - StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( - 1, cache_len, config.head_dim - ) - for layer_id in range(config.n_layers) - for i in range(config.n_kv_heads) - } - ys = [] - for i in range(n_chunks): - y_i, attn_update = static_transformer( - x[:, i * chunk_len : (i + 1) * chunk_len], - attn_options=ForwardOptions( - mask=mask, - freqs_cos_override=freqs_cos[i * chunk_len : (i + 1) * chunk_len], - freqs_sin_override=freqs_sin[i * chunk_len : (i + 1) * chunk_len], - in_cache_state=(k_caches, v_caches), - out_cache_state=({}, {}), - ), + + def test_with_style(style): + mask = StaticAttentionMask(chunk_len, cache_len, style=style) + mask.tensor[:, :, cache_len:] = torch.triu( + torch.full((1, chunk_len, chunk_len), float("-inf")), + diagonal=1, ) - ys.append(y_i) - mask[:, :, cache_len - chunk_len * (i + 1) : cache_len] = 0 - k_cache_updates, v_cache_updates = attn_update["out_cache_state"] - for cache_id, 
update in k_cache_updates.items(): - k_caches[cache_id] = StaticKVCache.apply_update( - k_caches[cache_id], update + k_caches = { + StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( + 1, cache_len, config.head_dim ) - for cache_id, update in v_cache_updates.items(): - v_caches[cache_id] = StaticKVCache.apply_update( - v_caches[cache_id], update + for layer_id in range(config.n_layers) + for i in range(config.n_kv_heads) + } + v_caches = { + StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( + 1, cache_len, config.head_dim ) + for layer_id in range(config.n_layers) + for i in range(config.n_kv_heads) + } + ys = [] + for i in range(n_chunks): + y_i, attn_update = static_transformer( + x[:, i * chunk_len : (i + 1) * chunk_len], + attn_options=ForwardOptions( + mask=mask.tensor, + freqs_cos_override=freqs_cos[ + i * chunk_len : (i + 1) * chunk_len + ], + freqs_sin_override=freqs_sin[ + i * chunk_len : (i + 1) * chunk_len + ], + in_cache_state=(k_caches, v_caches), + out_cache_state=({}, {}), + ), + ) + ys.append(y_i) + mask.unmask(chunk_len) + k_cache_updates, v_cache_updates = attn_update["out_cache_state"] + if i < n_chunks - 1: + for cache_id, update in k_cache_updates.items(): + k_caches[cache_id] = StaticKVCache.apply_update( + k_caches[cache_id], update, pos=chunk_len * i, style=style + ) + for cache_id, update in v_cache_updates.items(): + v_caches[cache_id] = StaticKVCache.apply_update( + v_caches[cache_id], update, pos=chunk_len * i, style=style + ) + + self.assertTrue(torch.isclose(ys[-1], expected, rtol=1e-3).all()) - self.assertTrue(torch.isclose(ys[-1], expected, rtol=1e-3).all()) + test_with_style("shift_pointer") + test_with_style("smart_mask") diff --git a/examples/models/llama3_2_vision/install_requirements.sh b/examples/models/llama3_2_vision/install_requirements.sh index 4d4a6f2862..9076cb967d 100755 --- a/examples/models/llama3_2_vision/install_requirements.sh +++ b/examples/models/llama3_2_vision/install_requirements.sh @@ -5,7 +5,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -NIGHTLY_VERSION="dev20250115" +set +ex + +NIGHTLY_VERSION="dev20250220" # Install torchtune nightly for model definitions. 
pip install --pre torchtune==0.6.0.${NIGHTLY_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu --no-cache-dir diff --git a/examples/qualcomm/oss_scripts/llama/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS index 3ef82293e0..e4bad10a23 100644 --- a/examples/qualcomm/oss_scripts/llama/TARGETS +++ b/examples/qualcomm/oss_scripts/llama/TARGETS @@ -19,6 +19,7 @@ python_library( name = "llama_lib", srcs = ["llama.py"], deps = [ + "//executorch/examples/models/llama:source_transformation", "//caffe2:torch", "//executorch/backends/qualcomm/partition:partition", "//executorch/backends/qualcomm/quantizer:quantizer", @@ -35,23 +36,12 @@ python_library( python_binary( name = "llama", - srcs = ["llama.py"], main_function = "executorch.examples.qualcomm.oss_scripts.llama.llama.main", preload_deps = [ "//executorch/extension/llm/custom_ops:model_sharding_py", ], deps = [ - "//executorch/examples/qualcomm/oss_scripts/llama:static_llama", - "//caffe2:torch", - "//executorch/extension/pybindings:aten_lib", - "//executorch/backends/qualcomm/partition:partition", - "//executorch/backends/qualcomm/quantizer:quantizer", - "//executorch/devtools/backend_debug:delegation_info", - "//executorch/devtools:lib", - "//executorch/examples/models:models", - "//executorch/examples/qualcomm:utils", - "//executorch/extension/export_util:export_util", - "//executorch/extension/llm/export:export_lib", + ":llama_lib", ], ) diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index ab27714ae1..e853812a94 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -881,6 +881,10 @@ def post_process(): adb.pull(output_path=args.artifact, callback=post_process) if args.ip and args.port != -1: + inference_speed = 0 + with open(f"{args.artifact}/outputs/inference_speed.txt", "r") as f: + inference_speed = float(f.read()) + pte_size = os.path.getsize(pte_path) with Client((args.ip, args.port)) as conn: conn.send( @@ -888,6 +892,7 @@ def post_process(): { "result": outputs, "pte_size": pte_size, + "inference_speed": inference_speed, } ) ) @@ -1034,10 +1039,7 @@ def _build_parser(): return parser -def main(args) -> None: - parser = _build_parser() - - args = parser.parse_args(args) +def export_llama(args) -> None: if args.compile_only and args.pre_gen_pte: exit("Cannot set both compile_only and pre_gen_pte as true") @@ -1138,6 +1140,12 @@ def main(args) -> None: raise Exception(e) +def main(): + parser = _build_parser() + args = parser.parse_args() + export_llama(args) + + # flake8: noqa: C901 if __name__ == "__main__": - main(sys.argv[1:]) + main() diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 4b45863147..70ba25a097 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include using executorch::aten::Tensor; @@ -518,6 +519,19 @@ void printReport(const Runner::Stats& stats) { stats.num_generated_tokens, (double)stats.aggregate_sampling_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND); + + // For now, we just print the total inference time for CI, can save more info + // in future if needed. 
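  // Specifically: tokens/second = num_generated_tokens /
  // (inference_end_ms - inference_start_ms) * SCALING_FACTOR_UNITS_PER_SECOND.
  // The value lands in outputs/inference_speed.txt, which the host-side
  // llama.py above reads back and forwards as "inference_speed" in the
  // result payload.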
+ std::ofstream outfile("outputs/inference_speed.txt"); + if (outfile.is_open()) { + double num_tok = (stats.num_generated_tokens) / + (double)(stats.inference_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND; + outfile << num_tok; + outfile.close(); + } else { + ET_CHECK_MSG(false, "Error saving the inference speed file"); + } } std::string statsToJsonString(const Runner::Stats& stats) { diff --git a/exir/tests/test_joint_graph.py b/exir/tests/test_joint_graph.py index 349fa92e82..fb74b70d31 100644 --- a/exir/tests/test_joint_graph.py +++ b/exir/tests/test_joint_graph.py @@ -18,6 +18,7 @@ from torch.export._trace import _export from torch.export.experimental import _export_forward_backward from torch.export.exported_program import OutputKind +from torch.testing import assert_close class TestJointGraph(unittest.TestCase): @@ -100,7 +101,8 @@ def forward(self, x, y): example_inputs ) # ET outputs are [loss, grads, weights] - self.assertTrue(torch.allclose(loss, et_outputs[0])) + # Without rtol and atol, this test fails in macos. + assert_close(loss, et_outputs[0], rtol=1e-4, atol=1e-4) self.assertTrue( torch.allclose(m.linear.weight.grad, et_outputs[1]) # pyre-ignore ) diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java index 15f527475b..afa8fca323 100644 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java @@ -44,6 +44,7 @@ protected void onCreate(Bundle savedInstanceState) { .get(); int numIter = intent.getIntExtra("num_iter", 50); + int numWarmupIter = intent.getIntExtra("num_warm_up_iter", 5); // TODO: Format the string with a parsable format Stats stats = new Stats(); @@ -58,6 +59,10 @@ protected Void doInBackground(Void... 
voids) { stats.errorCode = module.loadMethod("forward"); stats.loadEnd = System.nanoTime(); + for (int i = 0; i < numWarmupIter; i++) { + module.forward(); + } + for (int i = 0; i < numIter; i++) { long start = System.nanoTime(); module.forward(); diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index 16c1c1c1d6..332c3986b0 100644 --- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -82,7 +82,7 @@ @implementation LLaMATests return; } TokensPerSecondMetric *tokensPerSecondMetric = [TokensPerSecondMetric new]; - [testCase measureWithMetrics:@[ tokensPerSecondMetric, [XCTMemoryMetric new] ] + [testCase measureWithMetrics:@[ tokensPerSecondMetric, [XCTClockMetric new], [XCTMemoryMetric new] ] block:^{ tokensPerSecondMetric.tokenCount = 0; const auto status = runner->generate( diff --git a/extension/flat_tensor/serialize/targets.bzl b/extension/flat_tensor/serialize/targets.bzl index 78054af30e..717418ec7e 100644 --- a/extension/flat_tensor/serialize/targets.bzl +++ b/extension/flat_tensor/serialize/targets.bzl @@ -39,7 +39,9 @@ def define_common_targets(): name = "flat_tensor_header", srcs = ["flat_tensor_header.cpp"], exported_headers = ["flat_tensor_header.h"], - visibility = ["//executorch/..."], + visibility = [ + "//executorch/...", + ], exported_deps = ["//executorch/runtime/core:core"], ) @@ -54,6 +56,7 @@ def define_common_targets(): exported_headers = ["serialize.h"], visibility = [ "//executorch/...", + "@EXECUTORCH_CLIENTS", ], exported_external_deps = ["flatbuffers-api"], ) diff --git a/extension/flat_tensor/targets.bzl b/extension/flat_tensor/targets.bzl index ed2adefc58..6f627492f2 100644 --- a/extension/flat_tensor/targets.bzl +++ b/extension/flat_tensor/targets.bzl @@ -9,13 +9,15 @@ def define_common_targets(): exported_headers = ["flat_tensor_data_map.h"], deps = [ "//executorch/extension/flat_tensor/serialize:generated_headers", - "//executorch/extension/flat_tensor/serialize:flat_tensor_header", "//executorch/runtime/core:core", "//executorch/runtime/core:evalue", "//executorch/runtime/core:named_data_map", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/core/exec_aten/util:tensor_util", ], + exported_deps = [ + "//executorch/extension/flat_tensor/serialize:flat_tensor_header", + ], visibility = [ "//executorch/...", ], diff --git a/extension/module/CMakeLists.txt b/extension/module/CMakeLists.txt index 70441265c6..d144ce9535 100644 --- a/extension/module/CMakeLists.txt +++ b/extension/module/CMakeLists.txt @@ -27,7 +27,7 @@ if(CMAKE_TOOLCHAIN_IOS else() add_library(extension_module SHARED ${_extension_module__srcs}) endif() -target_link_libraries(extension_module PRIVATE executorch extension_data_loader) +target_link_libraries(extension_module PRIVATE executorch extension_data_loader extension_flat_tensor) target_include_directories(extension_module PUBLIC ${EXECUTORCH_ROOT}/..) target_compile_options( extension_module PUBLIC -Wno-deprecated-declarations -fPIC @@ -37,7 +37,7 @@ target_compile_options( # after cleaning up CMake targets. add_library(extension_module_static STATIC ${_extension_module__srcs}) target_link_libraries( - extension_module_static PRIVATE executorch extension_data_loader + extension_module_static PRIVATE executorch extension_data_loader extension_flat_tensor ) target_include_directories(extension_module_static PUBLIC ${EXECUTORCH_ROOT}/..) 
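# The extension_flat_tensor link added above is what lets Module resolve
# external weights from a .ptd data map (see module.cpp / module.h later in
# this change). A minimal consumer sketch, assuming a hypothetical `my_runner`
# target and that these ExecuTorch targets are already defined in the build:
#
#   add_executable(my_runner main.cpp)
#   target_link_libraries(my_runner PRIVATE extension_module_static
#                         extension_data_loader extension_flat_tensor executorch)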
target_compile_options( diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 99cc7e38bd..aa750e2691 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -36,15 +37,59 @@ namespace executorch { namespace extension { +namespace { +runtime::Result> load_file( + const std::string& file_path, + Module::LoadMode mode) { + std::unique_ptr res = nullptr; + switch (mode) { + case Module::LoadMode::File: + res = ET_UNWRAP_UNIQUE(FileDataLoader::from(file_path.c_str())); + break; + case Module::LoadMode::Mmap: + res = ET_UNWRAP_UNIQUE(MmapDataLoader::from( + file_path.c_str(), MmapDataLoader::MlockConfig::NoMlock)); + break; + case Module::LoadMode::MmapUseMlock: + res = ET_UNWRAP_UNIQUE(MmapDataLoader::from(file_path.c_str())); + break; + case Module::LoadMode::MmapUseMlockIgnoreErrors: + res = ET_UNWRAP_UNIQUE(MmapDataLoader::from( + file_path.c_str(), + MmapDataLoader::MlockConfig::UseMlockIgnoreErrors)); + break; + } + return res; +} +} // namespace + +Module::Module( + const std::string& file_path, + const LoadMode load_mode, + std::unique_ptr event_tracer) + : file_path_(file_path), + load_mode_(load_mode), + memory_allocator_(std::make_unique()), + temp_allocator_(std::make_unique()), + event_tracer_(std::move(event_tracer)), + data_map_loader_(nullptr), + data_map_(nullptr) { + runtime::runtime_init(); +} + Module::Module( const std::string& file_path, + const std::string& data_map_path, const LoadMode load_mode, std::unique_ptr event_tracer) : file_path_(file_path), + data_map_path_(data_map_path), load_mode_(load_mode), memory_allocator_(std::make_unique()), temp_allocator_(std::make_unique()), - event_tracer_(std::move(event_tracer)) { + event_tracer_(std::move(event_tracer)), + data_map_loader_(nullptr), + data_map_(nullptr) { runtime::runtime_init(); } @@ -52,7 +97,8 @@ Module::Module( std::unique_ptr data_loader, std::unique_ptr memory_allocator, std::unique_ptr temp_allocator, - std::unique_ptr event_tracer) + std::unique_ptr event_tracer, + std::unique_ptr data_map_loader) : data_loader_(std::move(data_loader)), memory_allocator_( memory_allocator ? std::move(memory_allocator) @@ -60,7 +106,9 @@ Module::Module( temp_allocator_( temp_allocator ? std::move(temp_allocator) : std::make_unique()), - event_tracer_(std::move(event_tracer)) { + event_tracer_(std::move(event_tracer)), + data_map_loader_(std::move(data_map_loader)), + data_map_(nullptr) { runtime::runtime_init(); } @@ -68,7 +116,8 @@ Module::Module( std::shared_ptr program, std::unique_ptr memory_allocator, std::unique_ptr temp_allocator, - std::unique_ptr event_tracer) + std::unique_ptr event_tracer, + std::unique_ptr data_map_loader) : program_(std::move(program)), memory_allocator_( memory_allocator ? std::move(memory_allocator) @@ -76,33 +125,37 @@ Module::Module( temp_allocator_( temp_allocator ? 
std::move(temp_allocator) : std::make_unique()), - event_tracer_(std::move(event_tracer)) { + event_tracer_(std::move(event_tracer)), + data_map_loader_(std::move(data_map_loader)), + data_map_(nullptr) { runtime::runtime_init(); } runtime::Error Module::load(const runtime::Program::Verification verification) { if (!is_loaded()) { + // Load the program if (!data_loader_) { - switch (load_mode_) { - case LoadMode::File: - data_loader_ = - ET_UNWRAP_UNIQUE(FileDataLoader::from(file_path_.c_str())); - break; - case LoadMode::Mmap: - data_loader_ = ET_UNWRAP_UNIQUE(MmapDataLoader::from( - file_path_.c_str(), MmapDataLoader::MlockConfig::NoMlock)); - break; - case LoadMode::MmapUseMlock: - data_loader_ = - ET_UNWRAP_UNIQUE(MmapDataLoader::from(file_path_.c_str())); - break; - case LoadMode::MmapUseMlockIgnoreErrors: - data_loader_ = ET_UNWRAP_UNIQUE(MmapDataLoader::from( - file_path_.c_str(), - MmapDataLoader::MlockConfig::UseMlockIgnoreErrors)); - break; + auto res = load_file(file_path_, load_mode_); + if (!res.ok()) { + return res.error(); } - }; + data_loader_ = std::move(res.get()); + } + // If a .ptd path was given load it. + if (data_map_path_ != "") { + auto res = load_file(data_map_path_, load_mode_); + if (!res.ok()) { + return res.error(); + } + data_map_loader_ = std::move(res.get()); + } + // If we have a .ptd loader, then load the map. + if (data_map_loader_) { + data_map_ = + ET_UNWRAP_UNIQUE(FlatTensorDataMap::load(data_map_loader_.get())); + } + // else: either the map itself was provided or we have no data map, either + // way no work to do. auto program = ET_UNWRAP_UNIQUE( runtime::Program::load(data_loader_.get(), verification)); program_ = std::shared_ptr( @@ -130,6 +183,7 @@ runtime::Error Module::load_method( ET_CHECK_OK_OR_RETURN_ERROR(load()); MethodHolder method_holder; + const auto method_metadata = ET_UNWRAP(program_->method_meta(method_name.c_str())); const auto planned_buffersCount = @@ -155,7 +209,8 @@ runtime::Error Module::load_method( method_holder.method = ET_UNWRAP_UNIQUE(program_->load_method( method_name.c_str(), method_holder.memory_manager.get(), - event_tracer ? event_tracer : this->event_tracer())); + event_tracer ? event_tracer : this->event_tracer(), + data_map_.get())); method_holder.inputs.resize(method_holder.method->inputs_size()); methods_.emplace(method_name, std::move(method_holder)); } diff --git a/extension/module/module.h b/extension/module/module.h index 45ed38a7ff..dc7c930d7c 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -51,6 +51,21 @@ class Module { const LoadMode load_mode = LoadMode::MmapUseMlock, std::unique_ptr event_tracer = nullptr); + /** + * Constructs an instance by loading a program from a file with specified + * memory locking behavior. + * + * @param[in] file_path The path to the ExecuTorch program file to load. + * @param[in] data_map_path The path to a .ptd file + * @param[in] load_mode The loading mode to use. + * @param[in] event_tracer A EventTracer used for tracking and logging events. + */ + explicit Module( + const std::string& file_path, + const std::string& data_map_path, + const LoadMode load_mode = LoadMode::MmapUseMlock, + std::unique_ptr event_tracer = nullptr); + /** * Constructs an instance with the provided data loader and memory allocator. * @@ -59,12 +74,14 @@ class Module { * @param[in] temp_allocator A MemoryAllocator to use when allocating * temporary data during kernel or delegate execution. * @param[in] event_tracer A EventTracer used for tracking and logging events. 
+ * @param[in] data_map_loader A DataLoader used for loading external weights. */ explicit Module( std::unique_ptr data_loader, std::unique_ptr memory_allocator = nullptr, std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr); + std::unique_ptr event_tracer = nullptr, + std::unique_ptr data_map_loader = nullptr); /** * Constructs an instance using an existing shared program. @@ -75,12 +92,14 @@ class Module { * @param[in] temp_allocator A MemoryAllocator to use when allocating * temporary data. * @param[in] event_tracer A EventTracer used for tracking and logging events. + * @param[in] data_map_loader A DataLoader used for loading external weights. */ explicit Module( std::shared_ptr program, std::unique_ptr memory_allocator = nullptr, std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr); + std::unique_ptr event_tracer = nullptr, + std::unique_ptr data_map_loader = nullptr); Module(const Module&) = delete; Module& operator=(const Module&) = delete; @@ -433,14 +452,16 @@ class Module { std::vector inputs; }; - private: std::string file_path_; + std::string data_map_path_; LoadMode load_mode_{LoadMode::MmapUseMlock}; std::shared_ptr program_; std::unique_ptr data_loader_; std::unique_ptr memory_allocator_; std::unique_ptr temp_allocator_; std::unique_ptr event_tracer_; + std::unique_ptr data_map_loader_; + std::unique_ptr data_map_; protected: std::unordered_map methods_; diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index 61251047dc..4cbfa0ca0f 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -25,6 +25,7 @@ def define_common_targets(): "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/extension/data_loader:file_data_loader", "//executorch/extension/data_loader:mmap_data_loader", + "//executorch/extension/flat_tensor:flat_tensor_data_map", ], exported_deps = [ "//executorch/runtime/executor:program" + aten_suffix, diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 2dbb0fea93..ac7d4db13a 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -22,14 +22,20 @@ using namespace ::executorch::runtime; class ModuleTest : public ::testing::Test { protected: static void SetUpTestSuite() { - model_path_ = std::getenv("RESOURCES_PATH") + std::string("/add.pte"); + std::string resources_path; + if (const char* env = std::getenv("RESOURCES_PATH")) { + resources_path = env; + } + model_path_ = resources_path + "/add.pte"; + linear_path_ = resources_path + "/linear.pte"; + linear_data_path_ = resources_path + "/linear.ptd"; } - static std::string model_path_; + static inline std::string model_path_; + static inline std::string linear_path_; + static inline std::string linear_data_path_; }; -std::string ModuleTest::model_path_; - TEST_F(ModuleTest, TestLoad) { Module module(model_path_); @@ -435,3 +441,14 @@ TEST_F(ModuleTest, TestSetOutputInvalidType) { EXPECT_NE(module.set_output(EValue()), Error::Ok); } + +TEST_F(ModuleTest, TestPTD) { + Module module(linear_path_, linear_data_path_); + + ASSERT_EQ(module.load_method("forward"), Error::Ok); + + auto tensor1 = + make_tensor_ptr({3, 3}, {2.f, 3.f, 4.f, 2.f, 3.f, 4.f, 2.f, 3.f, 4.f}); + + ASSERT_EQ(module.forward(tensor1).error(), Error::Ok); +} diff --git a/extension/module/test/resources/README.md b/extension/module/test/resources/README.md index e2b54633fa..ecbdd41c10 100644 --- 
a/extension/module/test/resources/README.md +++ b/extension/module/test/resources/README.md @@ -1,11 +1,23 @@ ## Resources -### model.pte +### add.pte, linear.pte, linear.ptd - Internally generated after D62209852, 2024-09-06 with: ``` buck2 run fbcode//executorch/examples/portable/scripts:export -- --model_name="add" ``` + + and + + ``` + buck2 run fbcode//executorch/examples/portable/scripts:export -- --model_name="linear" -examples + ``` - In OSS, the same file can be generated after [#5145](https://github.com/pytorch/executorch/pull/5145), 2024-09-06 with: ``` python -m examples.portable.scripts.export --model_name="add" ``` + + and + + ``` + python -m examples.portable.scripts.export --model_name="linear" -e + ``` diff --git a/extension/module/test/resources/linear.ptd b/extension/module/test/resources/linear.ptd new file mode 100644 index 0000000000000000000000000000000000000000..edab857bb3f24db7f53e812ae6949588d6e62fb8 GIT binary patch literal 336 zcmZ=^U|?_yF)(!VFfh~rvJHS31Rj7%1_l8}2*Ux&hfzEX0zf7U0|x^S5CYW!u?kp( zfdQliBnFa$s9|7m0J1=U35XMbSOka#7-WEI*nz4*x<$YwkiiBrM*@gJx?%2tIsY5S`O}oEk+$B!xvFDJ~Es5epHE6bk-;AXe&WP7N3%F%kqz3rn%Gu&}VS zu(Y)F7l=Q=A7P=M@9pfyD>C8T?#<4;nVp%-yojtfYKvH8TJm_N#FrfQvw+Adumn_q zc_0Nm{>Cv^4{*Xgbq_yroYc`rWKMzvZyrnqDM1TfIDL(Ztbt$|r~uc{2S5QJ=Lq2B z#%=7%Gv-FSt?xKy&fJ4iI_Eu^z#8_Vla-aQ*=)Dva)`~$z~(gSw$FRf&VJ|gLhnaD zk#W>Q>oShFE8r4vxz+7{OYiPc|A{!bim}OJ<(|1RFCQ2Ks3&f$Uk-cUoHCaBl8cf7 z=u0>uXOf$_RS@I_2fI83KCrw3tmy_20OWi_53!6eCfRe)G6vl!C3{ZW?O5+(4)Cb! zS(;~?ID@~(KD*k#TYJ+t5W0Ia_qwr{tc&a0xa6W8S4vtJ^L)0sXiZYa=303M`LjRC z{OS0lHfc_^$P6Z$7MyNYZ5kR%#w*}C3o!OQVzCBQU|9aXOu<{b1d{|eImu0N+>$Ba z&2fKO?p^4Hao^c|S<5_r|Jmvkl1cB9@%T-$m&}!U->R>fJm&-h3wy2bY|vBM z>in?+|98-MbBTB`G@}0qtWh2BjOo4yoz*P93?nA)B(^qgKWZHw_3%FIo~#lloc(k& l!`w;$Yt>jR){51jQ7hJhQn|8L4a#MxP1g=O-G10@${&jA( literal 0 HcmV?d00001 diff --git a/extension/pybindings/portable_lib.py b/extension/pybindings/portable_lib.py index 25624ad60c..24097fea6a 100644 --- a/extension/pybindings/portable_lib.py +++ b/extension/pybindings/portable_lib.py @@ -38,6 +38,7 @@ _create_profile_block, # noqa: F401 _dump_profile_results, # noqa: F401 _get_operator_names, # noqa: F401 + _get_registered_backend_names, # noqa: F401 _load_bundled_program_from_buffer, # noqa: F401 _load_for_executorch, # noqa: F401 _load_for_executorch_from_buffer, # noqa: F401 diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 97bff67114..f17ddbbbc3 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -91,6 +92,8 @@ using ::executorch::runtime::DataLoader; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerDebugLogLevel; +using ::executorch::runtime::get_backend_name; +using ::executorch::runtime::get_num_registered_backends; using ::executorch::runtime::get_registered_kernels; using ::executorch::runtime::HierarchicalAllocator; using ::executorch::runtime::Kernel; @@ -975,6 +978,18 @@ py::list get_operator_names() { return res; } +py::list get_registered_backend_names() { + size_t n_of_registered_backends = get_num_registered_backends(); + py::list res; + for (size_t i = 0; i < n_of_registered_backends; i++) { + auto backend_name_res = get_backend_name(i); + THROW_IF_ERROR(backend_name_res.error(), "Failed to get backend name"); + auto backend_name = backend_name_res.get(); + res.append(backend_name); + } + return res; +} + } // namespace 
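// get_registered_backend_names() above is a thin wrapper over the registry
// accessors introduced in runtime/backend/interface.h in this same change
// (get_num_registered_backends() and get_backend_name(i)). It is registered
// below as _get_registered_backend_names and surfaced to Python through
// Runtime.get().backend_registry.registered_backend_names (see
// runtime/__init__.py and test_backend_pybinding.py later in this diff).
// A rough usage sketch:
//
//   from executorch.runtime import Runtime
//   names = Runtime.get().backend_registry.registered_backend_names
//   # e.g. ["XnnpackBackend", ...], depending on which backends are linked in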
PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { @@ -1028,6 +1043,10 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { prof_result.num_bytes); }, call_guard); + m.def( + "_get_registered_backend_names", + &get_registered_backend_names, + call_guard); m.def("_get_operator_names", &get_operator_names); m.def("_create_profile_block", &create_profile_block, call_guard); m.def( diff --git a/extension/pybindings/pybindings.pyi b/extension/pybindings/pybindings.pyi index fc44ce388a..a380e90528 100644 --- a/extension/pybindings/pybindings.pyi +++ b/extension/pybindings/pybindings.pyi @@ -220,6 +220,15 @@ def _get_operator_names() -> List[str]: """ ... +@experimental("This API is experimental and subject to change without notice.") +def _get_registered_backend_names() -> List[str]: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + ... + @experimental("This API is experimental and subject to change without notice.") def _create_profile_block(name: str) -> None: """ diff --git a/extension/pybindings/test/TARGETS b/extension/pybindings/test/TARGETS index 73063deb65..4770bebbcc 100644 --- a/extension/pybindings/test/TARGETS +++ b/extension/pybindings/test/TARGETS @@ -47,3 +47,11 @@ runtime.python_test( "//executorch/kernels/quantized:aot_lib", ], ) + +runtime.python_test( + name = "test_backend_pybinding", + srcs = ["test_backend_pybinding.py"], + deps = [ + "//executorch/runtime:runtime", + ], +) diff --git a/extension/pybindings/test/test_backend_pybinding.py b/extension/pybindings/test/test_backend_pybinding.py new file mode 100644 index 0000000000..fbdc2be779 --- /dev/null +++ b/extension/pybindings/test/test_backend_pybinding.py @@ -0,0 +1,14 @@ +import unittest + +from executorch.runtime import Runtime + + +class TestBackendsPybinding(unittest.TestCase): + def test_backend_name_list( + self, + ) -> None: + + runtime = Runtime.get() + registered_backend_names = runtime.backend_registry.registered_backend_names + self.assertGreaterEqual(len(registered_backend_names), 1) + self.assertIn("XnnpackBackend", registered_backend_names) diff --git a/kernels/optimized/cpu/op_log_softmax.cpp b/kernels/optimized/cpu/op_log_softmax.cpp index c3f090a6df..1d2467bca5 100644 --- a/kernels/optimized/cpu/op_log_softmax.cpp +++ b/kernels/optimized/cpu/op_log_softmax.cpp @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include @@ -66,30 +68,30 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) { } // calculate sum and exponential in softmax dim OUT_T temp_sum = 0; -#ifndef __aarch64__ - for (auto d = 0; d < dim_size; ++d) { - output_data[d * dim_stride] = - std::exp(input_data[d * dim_stride] - max_input); - temp_sum += output_data[d * dim_stride]; - } -#else + using VecOut = at::vec::Vectorized; + using VecIn = at::vec::Vectorized; auto d = 0; - for (; d + 4 < dim_size; d += 4) { + static_assert(sizeof(IN_T) == sizeof(OUT_T)); + static_assert( + std::is_same_v, + "Below loop actually only supports float."); + const VecIn max_input_vec(max_input); + for (; d + VecOut::size() < dim_size; d += VecOut::size()) { auto index = d * dim_stride; - float32x4_t in = - vld1q_f32(static_cast(&input_data[index])); - float32x4_t out_ = - Sleef_expf4_u10(vsubq_f32(in, vmovq_n_f32(max_input))); - vst1q_f32(static_cast(&output_data[index]), out_); + auto in = VecIn::loadu(&input_data[index]); + auto out_ = (in - max_input_vec).exp(); + out_.store(&output_data[index]); +#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) temp_sum += 
vaddvq_f32(out_); +#else + temp_sum += at::vec::vec_reduce_all(std::plus(), out_); +#endif } - for (; d < dim_size; ++d) { output_data[d * dim_stride] = std::exp(input_data[d * dim_stride] - max_input); temp_sum += output_data[d * dim_stride]; } -#endif // __aarch64__ temp_sum = std::log(temp_sum); diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index 7ee880d997..489421f1b2 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -15,6 +15,8 @@ #include #include +#include + namespace torch { namespace executor { namespace native { @@ -138,110 +140,9 @@ Tensor& opt_sub_out( } } - auto selected_optimized_path = select_optimized_path(a, b, out); - if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { - // Resize for dynamic shape - auto error = resize_tensor(out, a.sizes()); - ET_KERNEL_CHECK_MSG( - ctx, - error == Error::Ok, - InvalidArgument, - out, - "Failed to resize output tensor."); - - ET_SWITCH_REAL_TYPES(a_type, ctx, "sub.out", CTYPE, [&]() { - CTYPE alpha_val; - ET_KERNEL_CHECK( - ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - - using Vec = executorch::vec::Vectorized; - executorch::vec::map2( - [alpha_val](Vec x, Vec y) { return x - Vec(alpha_val) * y; }, - out.mutable_data_ptr(), - a.const_data_ptr(), - b.const_data_ptr(), - out.numel()); - }); - } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) { - const Tensor* lhs; - const Tensor* rhs; - if (selected_optimized_path == - ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { - lhs = &b; - rhs = &a; - } else { - // Catch failure to update logic when subing new broadcasting possibility. - ET_DCHECK( - selected_optimized_path == - ElementwiseOptimizedPath::kBroadcast2dBy1d); - lhs = &a; - rhs = &b; - } - auto error = resize_tensor(out, lhs->sizes()); - ET_KERNEL_CHECK_MSG( - ctx, - error == Error::Ok, - InvalidArgument, - out, - "Failed to resize output tensor."); - ET_SWITCH_REAL_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() { - CTYPE alpha_val; - ET_KERNEL_CHECK( - ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - - using Vec = executorch::vec::Vectorized; - if (selected_optimized_path == - ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { - executorch::vec::broadcasting_map_2d_by_1d( - [alpha_val](Vec x, Vec y) { return y - Vec(alpha_val) * x; }, - out.mutable_data_ptr(), - lhs->const_data_ptr(), - rhs->const_data_ptr(), - lhs->sizes()[lhs->dim() - 2], - lhs->sizes()[lhs->dim() - 1]); - } else { - executorch::vec::broadcasting_map_2d_by_1d( - [alpha_val](Vec x, Vec y) { return x - Vec(alpha_val) * y; }, - out.mutable_data_ptr(), - lhs->const_data_ptr(), - rhs->const_data_ptr(), - lhs->sizes()[lhs->dim() - 2], - lhs->sizes()[lhs->dim() - 1]); - } - }); - } else { - ScalarType common_type = - promoteTypes(a_type, b_type, /*half_to_float*/ true); - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - - ET_KERNEL_CHECK( - ctx, - resize_to_broadcast_target_size(a, b, out) == Error::Ok, - InvalidArgument, - out); - - ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.out", CTYPE_A, [&]() { - ET_SWITCH_REALH_TYPES(b_type, ctx, "sub.out", CTYPE_B, [&]() { - using CTYPE_IN = typename torch::executor:: - promote_types::type; - ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() { - CTYPE_IN alpha_val; - ET_KERNEL_CHECK( - ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - - 
SubInner< - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, alpha_val, out); - }); - }); - }); - } - - return out; + static constexpr const char op_name[] = "sub.out"; + return torch::executor::kernels::impl::opt_add_sub_out_impl( + ctx, a, b, alpha, out); } Tensor& opt_sub_scalar_out( diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index 94ceb1f4dc..2a66407a5c 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -57,15 +57,10 @@ _OPTIMIZED_ATEN_OPS = ( ), op_target( name = "op_log_softmax", - deps = select({ - "DEFAULT": [ - "//executorch/kernels/portable/cpu/util:activation_ops_util", - ], - "ovr_config//cpu:arm64": [ - "//executorch/kernels/portable/cpu/util:activation_ops_util", - "fbsource//third-party/sleef:sleef_arm", - ], - }), + deps = [ + "//executorch/kernels/portable/cpu/util:activation_ops_util", + "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", + ], ), op_target( name = "op_mm", @@ -95,6 +90,7 @@ _OPTIMIZED_ATEN_OPS = ( name = "op_sub", deps = [ ":binary_ops", + ":add_sub_impl", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", ], diff --git a/kernels/portable/cpu/util/test/broadcast_test.cpp b/kernels/portable/cpu/util/test/broadcast_test.cpp index 679296f112..7ffd95b6c5 100644 --- a/kernels/portable/cpu/util/test/broadcast_test.cpp +++ b/kernels/portable/cpu/util/test/broadcast_test.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -131,6 +132,7 @@ TEST(BroadcastUtilTest, GetBroadcastTargetSize) { .equals(ArrayRef({5, 2, 2}))); Tensor c = tf.zeros({4, 5}); + et_pal_init(); err = get_broadcast_target_size( a, c, diff --git a/kernels/test/op_sub_test.cpp b/kernels/test/op_sub_test.cpp index 39fc9e1492..aafaf688b0 100644 --- a/kernels/test/op_sub_test.cpp +++ b/kernels/test/op_sub_test.cpp @@ -99,6 +99,109 @@ class OpSubOutTest : public OperatorTest { EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{0.1, 1.2, 3.4, 7.8})); } + template + void test_broadcast_3D() { + TensorFactory tf_a; + + Tensor a = + tf_a.make({2, 2, 3}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + Tensor b = tf_a.make({2, 1, 3}, /*data=*/{2, 3, 4, 5, 6, 7}); + + // Destination for output of mul. + Tensor out = + tf_a.make({2, 2, 3}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + Tensor expected = + tf_a.make({2, 2, 3}, /*data=*/{-1, -1, -1, 2, 2, 2, 2, 2, 2, 5, 5, 5}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE(op_sub_out(a, b, 1.0, out), expected); + // b - a * 1.5 output should be + expected = tf_a.make( + {2, 2, 3}, + /*data=*/ + {0.5, + 0.0, + -0.5, + -4.0, + -4.5, + -5.0, + -5.5, + -6.0, + -6.5, + -10.0, + -10.5, + -11.0}); + EXPECT_TENSOR_CLOSE(op_sub_out(b, a, 1.5, out), expected); + } + + template + void test_broadcast_4D() { + TensorFactory tf_a; + + Tensor a = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60}); + Tensor b = tf_a.make( + {2, 1, 3, 5}, + /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}); + + // Destination for output of mul. 
+ Tensor out = tf_a.zeros({2, 2, 3, 5}); + Tensor expected = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE(op_sub_out(a, b, 1.0, out), expected); + expected = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, -15, -15, -15, -15, -15, -15, -15, -15, -15, + -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, + -15, -15, -15, -15, -15, -15, -15, -15, -15, -30, -30, -30, + -30, -30, -30, -30, -30, -30, -30, -30, -30, -30, -30, -30}); + EXPECT_TENSOR_CLOSE(op_sub_out(b, a, 1.0, out), expected); + + b = tf_a.make( + {2, 2, 1, 5}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}); + out = tf_a.zeros({2, 2, 3, 5}); + expected = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 15, 15, 15, 15, 15, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 35, 35, 35, 35, 35, 40, 40, 40, 40, 40}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE(op_sub_out(a, b, 1.0, out), expected); + expected = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{-0.5000, -1.0000, -1.5000, -2.0000, -2.5000, + -8.0000, -8.5000, -9.0000, -9.5000, -10.0000, + -15.5000, -16.0000, -16.5000, -17.0000, -17.5000, + + -18.0000, -18.5000, -19.0000, -19.5000, -20.0000, + -25.5000, -26.0000, -26.5000, -27.0000, -27.5000, + -33.0000, -33.5000, -34.0000, -34.5000, -35.0000, + + -35.5000, -36.0000, -36.5000, -37.0000, -37.5000, + -43.0000, -43.5000, -44.0000, -44.5000, -45.0000, + -50.5000, -51.0000, -51.5000, -52.0000, -52.5000, + + -53.0000, -53.5000, -54.0000, -54.5000, -55.0000, + -60.5000, -61.0000, -61.5000, -62.0000, -62.5000, + -68.0000, -68.5000, -69.0000, -69.5000, -70.0000}); + EXPECT_TENSOR_CLOSE(op_sub_out(b, a, 1.5, out), expected); + } + void test_sub_enumerate_a_types() { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_sub_enumerate_b_types(); @@ -237,6 +340,19 @@ TEST_F(OpSubOutTest, BroadcastScalarRank0Supported) { EXPECT_TENSOR_EQ(out, ret); } +TEST_F(OpSubOutTest, BroadcastNDTest) { + // Test 3D tensors + test_broadcast_3D(); + test_broadcast_3D(); + // Sub doesnt yet support BFloat16 + // test_broadcast_3D(); + + // Test 4D tensors + test_broadcast_4D(); + test_broadcast_4D(); + // test_broadcast_4D(); +} + // // Death Tests // diff --git a/runtime/__init__.py b/runtime/__init__.py index 4ed99ddae0..33999b716e 100644 --- a/runtime/__init__.py +++ b/runtime/__init__.py @@ -42,7 +42,7 @@ import functools from pathlib import Path from types import ModuleType -from typing import Any, BinaryIO, Dict, Optional, Sequence, Set, Union +from typing import Any, BinaryIO, Dict, List, Optional, Sequence, Set, Union try: from executorch.extension.pybindings.portable_lib import ( @@ -125,6 +125,21 @@ def load_method(self, name: str) -> Optional[Method]: return self._methods.get(name, None) +class BackendRegistry: + """The registry of backends that are available to the runtime.""" + + def __init__(self, legacy_module: ModuleType) -> None: + # TODO: Expose the kernel callables to Python. 
+ self._legacy_module = legacy_module + + @property + def registered_backend_names(self) -> List[str]: + """ + Returns the names of all registered backends as a list of strings. + """ + return self._legacy_module._get_registered_backend_names() + + class OperatorRegistry: """The registry of operators that are available to the runtime.""" @@ -157,6 +172,7 @@ def get() -> "Runtime": def __init__(self, *, legacy_module: ModuleType) -> None: # Public attributes. + self.backend_registry = BackendRegistry(legacy_module) self.operator_registry = OperatorRegistry(legacy_module) # Private attributes. self._legacy_module = legacy_module diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp index 84c0bb82d4..4fb1eadfa8 100644 --- a/runtime/backend/interface.cpp +++ b/runtime/backend/interface.cpp @@ -55,5 +55,16 @@ Error register_backend(const Backend& backend) { return Error::Ok; } +size_t get_num_registered_backends() { + return num_registered_backends; +} + +Result get_backend_name(size_t index) { + if (index >= num_registered_backends) { + return Error::InvalidArgument; + } + return registered_backends[index].name; +} + } // namespace runtime } // namespace executorch diff --git a/runtime/backend/interface.h b/runtime/backend/interface.h index c0305f68cd..b74858a9d9 100644 --- a/runtime/backend/interface.h +++ b/runtime/backend/interface.h @@ -139,6 +139,16 @@ struct Backend { */ ET_NODISCARD Error register_backend(const Backend& backend); +/** + * Returns the number of registered backends. + */ +size_t get_num_registered_backends(); + +/** + * Returns the backend name at the given index. + */ +Result get_backend_name(size_t index); + } // namespace runtime } // namespace executorch diff --git a/runtime/core/array_ref.h b/runtime/core/array_ref.h index d02aac955c..a23509e869 100644 --- a/runtime/core/array_ref.h +++ b/runtime/core/array_ref.h @@ -29,6 +29,7 @@ #include #include +#include #include namespace executorch { @@ -149,7 +150,7 @@ class ArrayRef final { if (Length != RHS.Length) { return false; } - for (size_t i = 0; i < this->Length; i++) { + for (const auto i : c10::irange(this->Length)) { if (Data[i] != RHS.Data[i]) { return false; } diff --git a/runtime/core/hierarchical_allocator.h b/runtime/core/hierarchical_allocator.h index f2f5fd18fb..b5031fa38e 100644 --- a/runtime/core/hierarchical_allocator.h +++ b/runtime/core/hierarchical_allocator.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -96,7 +97,7 @@ class HierarchicalAllocator final { "n_allocators %" PRIu32 " > %zu", n_allocators, kSpanArraySize); - for (uint32_t i = 0; i < n_allocators; ++i) { + for (const auto i : c10::irange(n_allocators)) { span_array_[i] = Span(allocators[i].base_address(), allocators[i].size()); } diff --git a/runtime/core/portable_type/c10/README.md b/runtime/core/portable_type/c10/README.md index df14d22a4c..104a6717ba 100644 --- a/runtime/core/portable_type/c10/README.md +++ b/runtime/core/portable_type/c10/README.md @@ -1,7 +1,13 @@ -We added an extra c10 directory so that runtime/core/portable_type/c10 +This directory contains header files from `c10` in PyTorch core that +need to be used in ExecuTorch core. They are copied here rather than +being found through the torch pip package to keep the core build +hermetic for embedded use cases. The headers should be exact copies +from PyTorch core; if they are out of sync, please send a PR! 
+ +We added an extra c10 directory so that `runtime/core/portable_type/c10` can be the directory to put on your include path, rather than -runtime/core/portable_type, because using runtime/core/portable_type +`runtime/core/portable_type`, because using `runtime/core/portable_type` would cause all headers in that directory to be includeable with `#include `. In particular, that includes -runtime/core/portable_type/complex.h, which would shadow the C99 -complex.h standard header. +`runtime/core/portable_type/complex.h`, which would shadow the C99 +`complex.h` standard header. diff --git a/runtime/core/portable_type/c10/c10/macros/Export.h b/runtime/core/portable_type/c10/c10/macros/Export.h index cb68060ed8..21808de77a 100644 --- a/runtime/core/portable_type/c10/c10/macros/Export.h +++ b/runtime/core/portable_type/c10/c10/macros/Export.h @@ -139,8 +139,10 @@ #endif #if defined(TORCH_HIP_BUILD_MAIN_LIB) +#define TORCH_HIP_CPP_API C10_EXPORT #define TORCH_HIP_API C10_EXPORT #else +#define TORCH_HIP_CPP_API C10_IMPORT #define TORCH_HIP_API C10_IMPORT #endif diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index 1e60b70a4b..2bde5eac5e 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -26,6 +26,7 @@ def define_common_targets(): "util/TypeSafeSignMath.h", "util/bit_cast.h", "util/floating_point_utils.h", + "util/irange.h", ], exported_preprocessor_flags = [ # NOTE: If we define C10_EMBEDDED to prevent Half and @@ -44,7 +45,7 @@ def define_common_targets(): "-DC10_USING_CUSTOM_GENERATED_MACROS", ], visibility = [ - "//executorch/runtime/core/portable_type/...", + "//executorch/...", ], deps = select({ "DEFAULT": [], diff --git a/runtime/core/portable_type/c10/c10/util/BFloat16.h b/runtime/core/portable_type/c10/c10/util/BFloat16.h index ad1271fc72..09d3051ab7 100644 --- a/runtime/core/portable_type/c10/c10/util/BFloat16.h +++ b/runtime/core/portable_type/c10/c10/util/BFloat16.h @@ -8,9 +8,7 @@ #include #include #include -#ifndef C10_EMBEDDED #include -#endif // C10_EMBEDDED #if defined(__CUDACC__) && !defined(USE_ROCM) #include @@ -116,14 +114,12 @@ struct alignas(2) BFloat16 { #endif }; -#ifndef C10_EMBEDDED C10_API inline std::ostream& operator<<( std::ostream& out, const BFloat16& value) { out << (float)value; return out; } -#endif // C10_EMBEDDED } // namespace c10 diff --git a/runtime/core/portable_type/c10/c10/util/Half.h b/runtime/core/portable_type/c10/c10/util/Half.h index 5625d4c340..b77cf7b1f4 100644 --- a/runtime/core/portable_type/c10/c10/util/Half.h +++ b/runtime/core/portable_type/c10/c10/util/Half.h @@ -29,9 +29,7 @@ #include #include #include -#ifndef C10_EMBEDDED #include -#endif // C10_EMBEDDED #ifdef __CUDACC__ #include @@ -411,12 +409,10 @@ struct alignas(2) Half { #endif }; -#ifndef C10_EMBEDDED C10_API inline std::ostream& operator<<(std::ostream& out, const Half& value) { out << (float)value; return out; } -#endif // C10_EMBEDDED } // namespace c10 diff --git a/runtime/core/portable_type/c10/c10/util/irange.h b/runtime/core/portable_type/c10/c10/util/irange.h new file mode 100644 index 0000000000..2719a82075 --- /dev/null +++ b/runtime/core/portable_type/c10/c10/util/irange.h @@ -0,0 +1,123 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+ +#pragma once + +#include + +#include +#include +#include +#include + +namespace c10 { + +namespace detail { + +template < + typename I, + bool one_sided = false, + std::enable_if_t, int> = 0> +struct integer_iterator { + using iterator_category = std::input_iterator_tag; + using value_type = I; + using difference_type = std::ptrdiff_t; + using pointer = I*; + using reference = I&; + + explicit integer_iterator(I value) : value(value) {} + + I operator*() const { + return value; + } + + I const* operator->() const { + return &value; + } + + integer_iterator& operator++() { + ++value; + return *this; + } + + integer_iterator operator++(int) { + const auto copy = *this; + ++*this; + return copy; + } + + bool operator==(const integer_iterator& other) const { + if constexpr (one_sided) { + // Range-for loops' end test is `begin != end`, not `begin < + // end`. To handle `c10::irange(n)` where n < 0 (which should be + // empty), we just make `begin != end` fail whenever `end` is + // negative. + return is_negative(other.value) || value == other.value; + } else { + return value == other.value; + } + // Suppress "warning: missing return statement at end of non-void function" + // which Nvidia's Robert Crovella confirms is an NVCC compiler error + // here https://stackoverflow.com/a/64561686/752843 on 2020-10-27 + // `__builtin_unreachable();` would be best here, but it's not + // available with all compilers. So we instead return an arbitrary + // value trusting that this line will, in fact, never be reached. + return false; // Horrible hack + } + + bool operator!=(const integer_iterator& other) const { + return !(*this == other); + } + + protected: + I value; +}; + +} // namespace detail + +template < + typename I, + bool one_sided = false, + std::enable_if_t, bool> = true> +struct integer_range { + public: + integer_range(I begin, I end) : begin_(begin), end_(end) {} + using iterator = detail::integer_iterator; + iterator begin() const { + return begin_; + } + iterator end() const { + return end_; + } + + private: + iterator begin_; + iterator end_; +}; + +/// Creates an integer range for the half-open interval [begin, end) +/// If end<=begin, then the range is empty. +/// The range has the type of the `end` integer; `begin` integer is +/// cast to this type. 
+template < + typename Integer1, + typename Integer2, + std::enable_if_t, bool> = true, + std::enable_if_t, bool> = true> +integer_range irange(Integer1 begin, Integer2 end) { + // If end<=begin then the range is empty; we can achieve this effect by + // choosing the larger of {begin, end} as the loop terminator + return { + static_cast(begin), + std::max(static_cast(begin), end)}; +} + +/// Creates an integer range for the half-open interval [0, end) +/// If end<=begin, then the range is empty +template < + typename Integer, + std::enable_if_t, bool> = true> +integer_range irange(Integer end) { + return {Integer(), end}; +} + +} // namespace c10 diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 43efeca208..6178f2c0f9 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -28,6 +28,9 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten/...", "//executorch/runtime/core/portable_type/test/...", ], + deps = [ + "//executorch/runtime/core/portable_type/c10/c10:c10", + ], exported_deps = [ ":scalar_type", "//executorch/runtime/core:core", diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp index b978e23cbd..6366a8eac2 100644 --- a/runtime/core/portable_type/tensor_impl.cpp +++ b/runtime/core/portable_type/tensor_impl.cpp @@ -11,6 +11,8 @@ #include #include +#include + #include #include #include @@ -30,7 +32,7 @@ ssize_t compute_numel(const TensorImpl::SizesType* sizes, ssize_t dim) { dim == 0 || sizes != nullptr, "Sizes must be provided for non-scalar tensors"); ssize_t numel = 1; // Zero-dimensional tensors (scalars) have numel == 1. - for (ssize_t i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { ET_CHECK_MSG( sizes[i] >= 0, "Size must be non-negative, got %d at dimension %zd", diff --git a/runtime/core/portable_type/test/bfloat16_test.cpp b/runtime/core/portable_type/test/bfloat16_test.cpp index 6b42a6e4a5..505f80e770 100644 --- a/runtime/core/portable_type/test/bfloat16_test.cpp +++ b/runtime/core/portable_type/test/bfloat16_test.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include @@ -41,7 +42,7 @@ uint16_t bits_from_f32(float src) { TEST(BFloat16Conversion, FloatToBFloat16AndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float in[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) in[i] = i + 1.25; } @@ -51,7 +52,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float out[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { bfloats[i].x = bits_from_f32(in[i]); out[i] = f32_from_bits(bfloats[i].x); @@ -64,7 +65,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) { TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float in[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) in[i] = i + 1.25; } @@ -74,7 +75,7 @@ TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float out[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { bfloats[i].x = round_to_nearest_even(in[i]); out[i] = f32_from_bits(bfloats[i].x); diff --git a/runtime/core/portable_type/test/targets.bzl b/runtime/core/portable_type/test/targets.bzl index c0b4ef00c7..d8e82a15fb 100644 --- a/runtime/core/portable_type/test/targets.bzl +++ b/runtime/core/portable_type/test/targets.bzl @@ -11,6 +11,7 @@ def define_common_targets(): srcs = ["bfloat16_test.cpp"], deps = [ "//executorch/runtime/core/portable_type:portable_type", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], ) @@ -52,5 +53,6 @@ def define_common_targets(): deps = [ "//executorch/runtime/core/exec_aten/util:tensor_util", "//executorch/runtime/core/portable_type:portable_type", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], ) diff --git a/runtime/core/portable_type/test/tensor_impl_test.cpp b/runtime/core/portable_type/test/tensor_impl_test.cpp index bd5f82c5d1..0b8ae05f4d 100644 --- a/runtime/core/portable_type/test/tensor_impl_test.cpp +++ b/runtime/core/portable_type/test/tensor_impl_test.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -76,7 +77,7 @@ TEST_F(TensorImplTest, TestSetSizesContigContract) { SizesType new_sizes[RANK] = {0, 0, 0, 0, 0}; // assign random sizes between 1 and 100 - for (int i = 0; i < RANK; i++) { + for (const auto i : c10::irange(RANK)) { new_sizes[i] = distribution(generator); } Error err = resize_tensor_impl(&t, {new_sizes, RANK}); diff --git a/runtime/core/result.h b/runtime/core/result.h index 7b404bca94..377573e6df 100644 --- a/runtime/core/result.h +++ b/runtime/core/result.h @@ -59,8 +59,13 @@ class Result final { * a non-Ok value. */ /* implicit */ Result(Error error) - : error_(error == Error::Ok ? Error::Internal : error), - hasValue_(false) {} + : error_(error == Error::Ok ? 
Error::Internal : error), hasValue_(false) { + if ET_UNLIKELY (error == Error::Ok) { + ET_LOG( + Debug, + "Attempted to create Result from Error::Ok, this has been converted to Error::Internal."); + } + } /// Value copy constructor. /* implicit */ Result(const T& val) : value_(val), hasValue_(true) {} diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl index c3535688f6..d67312beda 100644 --- a/runtime/core/targets.bzl +++ b/runtime/core/targets.bzl @@ -50,6 +50,7 @@ def define_common_targets(): ], exported_preprocessor_flags = get_core_flags(), exported_deps = [ + "//executorch/runtime/core/portable_type/c10/c10:c10", "//executorch/runtime/platform:platform", ], ) @@ -73,6 +74,7 @@ def define_common_targets(): ], exported_deps = [ ":core", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], visibility = [ "//executorch/...", @@ -145,13 +147,16 @@ def define_common_targets(): ":tensor_layout", ], ) - + runtime.cxx_library( name = "tensor_layout", srcs = ["tensor_layout.cpp"], exported_headers = ["tensor_layout.h"], + deps = [ + "//executorch/runtime/core/portable_type/c10/c10:c10", + ], exported_deps = [ - ":core", + ":core", "//executorch/runtime/core/exec_aten:lib", ], visibility = ["//executorch/..."], diff --git a/runtime/core/tensor_layout.cpp b/runtime/core/tensor_layout.cpp index 748a43fc5d..f0fac442e2 100644 --- a/runtime/core/tensor_layout.cpp +++ b/runtime/core/tensor_layout.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include @@ -43,7 +44,7 @@ Result TensorLayout::create( return Error::InvalidArgument; } - for (size_t i = 0; i < dim_order.size(); i++) { + for (const auto i : c10::irange(dim_order.size())) { if (dim_order[i] >= sizes.size()) { return Error::InvalidArgument; } diff --git a/runtime/core/test/error_handling_test.cpp b/runtime/core/test/error_handling_test.cpp index b6b5862398..ef270cad1e 100644 --- a/runtime/core/test/error_handling_test.cpp +++ b/runtime/core/test/error_handling_test.cpp @@ -110,6 +110,7 @@ TEST(ErrorHandlingTest, ResultBasic) { } TEST(ErrorHandlingTest, OkErrorNotPossible) { + executorch::runtime::runtime_init(); Result r(Error::Ok); ASSERT_FALSE(r.ok()); ASSERT_NE(r.error(), Error::Ok); diff --git a/runtime/core/test/event_tracer_test.cpp b/runtime/core/test/event_tracer_test.cpp index 622de1ff9f..9591d9c06e 100644 --- a/runtime/core/test/event_tracer_test.cpp +++ b/runtime/core/test/event_tracer_test.cpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -207,7 +208,7 @@ TEST(TestEventTracer, SimpleEventTracerTest) { // and also with a null pointer (to test that the null case works). DummyEventTracer dummy; std::vector dummy_event_tracer_arr = {&dummy, nullptr}; - for (size_t i = 0; i < dummy_event_tracer_arr.size(); i++) { + for (const auto i : c10::irange(dummy_event_tracer_arr.size())) { RunSimpleTracerTest(&dummy); RunSimpleTracerTest(nullptr); } @@ -234,7 +235,7 @@ TEST(TestEventTracer, SimpleEventTracerTestDelegate) { // and also with a null pointer (to test that the null case works). 
DummyEventTracer dummy; std::vector dummy_event_tracer_arr = {&dummy, nullptr}; - for (size_t i = 0; i < dummy_event_tracer_arr.size(); i++) { + for (const auto i : c10::irange(dummy_event_tracer_arr.size())) { RunSimpleTracerTestDelegate(&dummy); RunSimpleTracerTestDelegate(nullptr); } diff --git a/runtime/core/test/memory_allocator_test.cpp b/runtime/core/test/memory_allocator_test.cpp index dfd2f23a48..f0fa44ae6e 100644 --- a/runtime/core/test/memory_allocator_test.cpp +++ b/runtime/core/test/memory_allocator_test.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -62,12 +63,12 @@ TEST_F(MemoryAllocatorTest, MemoryAllocatorAlignment) { 128, 2}; - for (int i = 0; i < arr_size; i++) { + for (const auto i : c10::irange(arr_size)) { auto align_size = alignment[i]; constexpr size_t mem_size = 1000; uint8_t mem_pool[mem_size]; MemoryAllocator allocator = MemoryAllocator(mem_size, mem_pool); - for (int j = 0; j < arr_size; j++) { + for (const auto j : c10::irange(arr_size)) { auto size = allocation[j]; void* start = allocator.allocate(size, align_size); EXPECT_ALIGNED(start, align_size); @@ -81,7 +82,7 @@ TEST_F(MemoryAllocatorTest, MemoryAllocatorNonPowerOfTwoAlignment) { MemoryAllocator allocator(mem_size, mem_pool); size_t alignment[5] = {0, 5, 6, 12, 34}; - for (int i = 0; i < 5; i++) { + for (const auto i : c10::irange(5)) { ASSERT_EQ(nullptr, allocator.allocate(8, alignment[i])); } } diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl index 7332aad8a3..abe52bcadf 100644 --- a/runtime/core/test/targets.bzl +++ b/runtime/core/test/targets.bzl @@ -40,6 +40,7 @@ def define_common_targets(): ], deps = [ "//executorch/runtime/core:event_tracer", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], ) @@ -68,6 +69,7 @@ def define_common_targets(): ], deps = [ "//executorch/runtime/core:memory_allocator", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], ) diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index d435678ca2..0857bc1c97 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -33,6 +33,7 @@ namespace executorch { namespace runtime { +using deserialization::NamedData; using internal::PlatformMemoryAllocator; /** @@ -289,6 +290,113 @@ Result parse_cond_value(const EValue& cond_value) { } // namespace +Result Method::get_num_external_constants() { + auto flatbuffer_values = serialization_plan_->values(); + size_t n_value = flatbuffer_values->size(); + + size_t n_external_constants = 0; + for (size_t i = 0; i < n_value; ++i) { + auto serialization_value = flatbuffer_values->Get(i); + // Ensure values are non-null. + // Note that as a side-effect of this check, we're guaranteed that all + // values are non-null, so later loops can skip that check. + ET_CHECK_OR_RETURN_ERROR( + serialization_value != nullptr && + (serialization_value->val_type() == + executorch_flatbuffer::KernelTypes::Null || + serialization_value->val() != nullptr), + InvalidProgram, + "Null value at index %" ET_PRIsize_t, + i); + // Ignore non-tensor types. + if (serialization_value->val_type() != + executorch_flatbuffer::KernelTypes::Tensor) { + continue; + } + const auto s_tensor = static_cast( + serialization_value->val()); + + // An external constant is tagged with EXTERNAL and has no + // allocation_info. 
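+    // Mutable external tensors, by contrast, carry allocation_info; they are
+    // not counted here and are resolved later in parse_values.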
+ if (s_tensor->extra_tensor_info() != nullptr && + s_tensor->extra_tensor_info()->location() == + executorch_flatbuffer::TensorDataLocation::EXTERNAL && + s_tensor->allocation_info() == nullptr) { + n_external_constants++; + } + } + return n_external_constants; +} + +Error Method::parse_external_constants(const NamedDataMap* named_data_map) { + auto flatbuffer_values = serialization_plan_->values(); + size_t n_value = flatbuffer_values->size(); + + // n_external_constants_ counts the number of successfully-initialized + // external constants for ~Method() to clean up, and is incremented at the + // bottom of the loop. This makes it safe for errors to return without + // updating any state. + n_external_constants_ = 0; + for (size_t i = 0; i < n_value; ++i) { + auto serialization_value = flatbuffer_values->Get(i); + // Ignore non-tensor types. + if (serialization_value->val_type() != + executorch_flatbuffer::KernelTypes::Tensor) { + continue; + } + const auto s_tensor = static_cast( + serialization_value->val()); + // Constant tensors are resolved here; tensors with allocation_info are + // mutable and are resolved in parse_values. + if (s_tensor->extra_tensor_info() == nullptr || + s_tensor->extra_tensor_info()->location() != + executorch_flatbuffer::TensorDataLocation::EXTERNAL || + s_tensor->allocation_info() != nullptr) { + continue; + } + ET_CHECK_OR_RETURN_ERROR( + s_tensor->extra_tensor_info()->fully_qualified_name() != nullptr, + InvalidExternalData, + "Fully qualified name of external tensor is null at index %zu", + i); + + const char* key = + s_tensor->extra_tensor_info()->fully_qualified_name()->c_str(); + + // Check if this tensor has already been resolved. + if (get_data_by_key( + key, Span(external_constants_, n_external_constants_)) != + nullptr) { + continue; + } + Result tensor_layout = + named_data_map->get_metadata(key); + if (!tensor_layout.ok()) { + return tensor_layout.error(); + } + // Check external tensor compatibility. + Error err = + deserialization::validateTensorLayout(s_tensor, tensor_layout.get()); + if (err != Error::Ok) { + return err; + } + // Save the key. + external_constants_[n_external_constants_].key = key; + + // Save the buffer. + Result buffer = named_data_map->get_data(key); + ET_CHECK_OR_RETURN_ERROR( + buffer.ok(), + InvalidExternalData, + "Buffer retrieved from get_data is not valid"); + new (&external_constants_[n_external_constants_].buffer) + FreeableBuffer(std::move(buffer.get())); + + n_external_constants_ += 1; + } + return Error::Ok; +} + Error Method::parse_values(const NamedDataMap* named_data_map) { auto flatbuffer_values = serialization_plan_->values(); ET_CHECK_OR_RETURN_ERROR( @@ -299,6 +407,30 @@ Error Method::parse_values(const NamedDataMap* named_data_map) { return Error::MemoryAllocationFailed; } + // Count the number of tensors marked as EXTERNAL for this method. The actual + // number of external constants may be smaller, eg. if multiple tensors point + // to the same underlying data buffer. + // This function also ensures that all flatbuffer_values entries + // are non-null, so `val_as_X()` calls below are guaranteed to return + // non-null pointers. + Result max_external_constants = get_num_external_constants(); + if (!max_external_constants.ok()) { + return max_external_constants.error(); + } + if (max_external_constants.get() > 0) { + // Allocate space for external tensors. 
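+    // Note: this is an upper bound; tensors that share a fully qualified name
+    // are de-duplicated below in parse_external_constants.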
+ external_constants_ = + memory_manager_->method_allocator()->allocateList( + max_external_constants.get()); + if (external_constants_ == nullptr) { + return Error::MemoryAllocationFailed; + } + Error err = parse_external_constants(named_data_map); + if (err != Error::Ok) { + return err; + } + } + // n_value_ counts the number of successfully-initialized values for ~Method() // to clean up, and is incremented at the bottom of the loop. This makes it // safe for errors to return without updating any state. @@ -306,16 +438,6 @@ Error Method::parse_values(const NamedDataMap* named_data_map) { for (size_t i = 0; i < n_value; ++i) { auto serialization_value = flatbuffer_values->Get(i); - // Ensure that the `val_as_X()` calls will return non-null pointers. - ET_CHECK_OR_RETURN_ERROR( - serialization_value != nullptr && - (serialization_value->val_type() == - executorch_flatbuffer::KernelTypes::Null || - serialization_value->val() != nullptr), - InvalidProgram, - "Null value at index %" ET_PRIsize_t, - i); - const auto val = serialization_value->val(); switch (serialization_value->val_type()) { @@ -416,7 +538,8 @@ Error Method::parse_values(const NamedDataMap* named_data_map) { program_, memory_manager_, static_cast(val), - named_data_map); + named_data_map, + Span(external_constants_, n_external_constants_)); if (!t.ok()) { ET_LOG( Error, @@ -1496,6 +1619,10 @@ Method::~Method() { delegates_[i].~BackendDelegate(); } } + // Free resources associated with external constants. + for (int i = 0; i < n_external_constants_; i++) { + external_constants_[i].buffer.~FreeableBuffer(); + } // All other fields are trivially destructible. } } // namespace runtime diff --git a/runtime/executor/method.h b/runtime/executor/method.h index dff4e818f9..4108db8810 100644 --- a/runtime/executor/method.h +++ b/runtime/executor/method.h @@ -31,6 +31,12 @@ struct EValue; namespace executorch { namespace runtime { +// Forward declare NamedData. This is a public header and must not include +// internal data types. +namespace deserialization { +struct NamedData; +} // namespace deserialization + // Forward declare Program to avoid a circular reference. class Program; @@ -42,6 +48,7 @@ using OpFunction = void (*)(KernelRuntimeContext&, EValue**); /// A list of pointers into the master values table that together compose the /// argument list for a single instruction using InstructionArgs = Span; +using deserialization::NamedData; /** * An executable method of an executorch program. Maps to a python method like @@ -66,6 +73,8 @@ class Method final { delegates_(rhs.delegates_), n_chains_(rhs.n_chains_), chains_(rhs.chains_), + external_constants_(rhs.external_constants_), + n_external_constants_(rhs.n_external_constants_), init_state_(rhs.init_state_) { // Required: clear out fields that the dtor looks at, so that we don't free // anything twice. @@ -73,6 +82,8 @@ class Method final { rhs.values_ = nullptr; rhs.n_delegate_ = 0; rhs.delegates_ = nullptr; + rhs.n_external_constants_ = 0; + rhs.external_constants_ = nullptr; // Helpful: Try to ensure that any other interactions with the old object // result in failures. @@ -288,6 +299,8 @@ class Method final { delegates_(nullptr), n_chains_(0), chains_(nullptr), + external_constants_(nullptr), + n_external_constants_(0), init_state_(InitializationState::Uninitialized) {} /// Static factory used by Program. 
@@ -336,8 +349,31 @@ class Method final { size_t n_chains_; Chain* chains_; + NamedData* external_constants_; + size_t n_external_constants_ = 0; + InitializationState init_state_; + /** + * Counts the number of tensors marked as EXTERNAL in the flatbuffer + * for this method. + */ + ET_NODISCARD Result get_num_external_constants(); + + /** + * Parses the flatbuffer for constant tensors tagged as EXTERNAL. + * Retrieves the external constants using the named_data_map and places them + * into `external_constants_`. Updates `n_external_constants_` to count the + * number of successfully-initialized external constants. + * FreeableBuffers returned by the named_data_map are owned by the + * method and are freed on method destruction. + * + * @param[in] named_data_map, to retrieve external constants from. + * @returns Error::Ok on success, non-Ok on failure. + */ + ET_NODISCARD Error + parse_external_constants(const NamedDataMap* named_data_map); + /** * Parses the elements of the values_ array. On error, n_value_ will be set to * the number of successfully-initialized entries so that ~Method doesn't try diff --git a/runtime/executor/tensor_parser.h b/runtime/executor/tensor_parser.h index 2ffb473544..cfd711713a 100644 --- a/runtime/executor/tensor_parser.h +++ b/runtime/executor/tensor_parser.h @@ -21,11 +21,21 @@ namespace executorch { namespace runtime { namespace deserialization { +/// Data structure to hold key and data buffer for external data used +/// in a method. +struct NamedData { + const char* key; + FreeableBuffer buffer; +}; + +NamedData* get_data_by_key(const char* key, Span entries); + ET_NODISCARD Result parseTensor( const Program* program, MemoryManager* memory_manager, const executorch_flatbuffer::Tensor* s_tensor, - const NamedDataMap* named_data_map = nullptr); + const NamedDataMap* named_data_map = nullptr, + Span external_constants = {}); ET_NODISCARD Result> parseTensorList( const flatbuffers::Vector* tensor_indices, @@ -33,6 +43,12 @@ ET_NODISCARD Result> parseTensorList( size_t values_len, MemoryManager* memory_manager); +// Checks that the sizes, dim_order and scalar_type match between tensors +// stored in the PTE and externally. +ET_NODISCARD Error validateTensorLayout( + const executorch_flatbuffer::Tensor* s_tensor, + const TensorLayout& expected_layout); + // Deserializes a List of optional type. The code here is the same between all // list of optionals: list of optional Tensor, list of optional float etc, so we // just use a template to avoid boilerplate. @@ -105,7 +121,11 @@ parseListOptionalType( * @param[in] nbytes The amount of memory to get from the allocator. * @param[in] allocator The source of memory for non-constant tensors. * @param[in] named_data_map An optional map of {name, blob} used to resolve - * data that is external to the PTE, if any. + * data that is mutable and external to the PTE, if any. + * @param[in] external_constants An optional span containing tensor fqn to + * corresponding tensor data. Used to resolve data that is constant and + * external to the PTE, if any. Referencing data from external_constants is + * safe, as it has the same lifetime as the method. * * @returns On success, the data pointer to use for the tensor. On failure, a * non-Ok Error. 
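As a rough, non-compilable sketch of how the new parameter is threaded through
(identifiers approximate those used in the hunks above; template arguments are
elided):

    // In Method::parse_values: pass the already-resolved external constants
    // to the tensor parser alongside the named data map.
    auto externals = Span(external_constants_, n_external_constants_);
    auto t = deserialization::parseTensor(
        program_, memory_manager_, s_tensor, named_data_map, externals);
    // parseTensor forwards `externals` to getTensorDataPtr, which serves
    // constant EXTERNAL tensors from the span and falls back to
    // named_data_map->load_data_into(...) for mutable EXTERNAL tensors.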
@@ -115,7 +135,8 @@ ET_NODISCARD Result getTensorDataPtr( const Program* program, size_t nbytes, HierarchicalAllocator* allocator, - const NamedDataMap* named_data_map = nullptr); + const NamedDataMap* named_data_map = nullptr, + Span external_constants = {}); } // namespace deserialization } // namespace runtime diff --git a/runtime/executor/tensor_parser_aten.cpp b/runtime/executor/tensor_parser_aten.cpp index ab9af3d039..d1a2f71285 100644 --- a/runtime/executor/tensor_parser_aten.cpp +++ b/runtime/executor/tensor_parser_aten.cpp @@ -33,7 +33,8 @@ Result parseTensor( const Program* program, MemoryManager* memory_manager, const executorch_flatbuffer::Tensor* s_tensor, - const NamedDataMap* named_data_map) { + const NamedDataMap* named_data_map, + Span external_constants) { EXECUTORCH_SCOPE_PROF("TensorParser::parseTensor"); ET_CHECK_OR_RETURN_ERROR( @@ -108,7 +109,8 @@ Result parseTensor( program, tensor.nbytes(), memory_manager->planned_memory(), - named_data_map); + named_data_map, + external_constants); if (!data_ptr.ok()) { ET_LOG( Error, diff --git a/runtime/executor/tensor_parser_exec_aten.cpp b/runtime/executor/tensor_parser_exec_aten.cpp index 83310ff680..a1ac245acc 100644 --- a/runtime/executor/tensor_parser_exec_aten.cpp +++ b/runtime/executor/tensor_parser_exec_aten.cpp @@ -111,12 +111,60 @@ ET_NODISCARD Result> parseTensorList( evalp_list, tensor_list, tensor_indices->size()); } +ET_NODISCARD Error validateTensorLayout( + const executorch_flatbuffer::Tensor* s_tensor, + const TensorLayout& expected_layout) { + ET_CHECK_OR_RETURN_ERROR( + static_cast(s_tensor->scalar_type()) == + expected_layout.scalar_type(), + InvalidExternalData, + "Scalar type mismatch. Expected %hhd, got %hhd.", + static_cast(s_tensor->scalar_type()), + static_cast(expected_layout.scalar_type())); + int dim = s_tensor->sizes()->size(); + ET_CHECK_OR_RETURN_ERROR( + dim == expected_layout.sizes().size(), + InvalidExternalData, + "Dim mismatch. Expected %d, got %zu.", + dim, + expected_layout.sizes().size()); + for (int i = 0; i < dim; i++) { + ET_CHECK_OR_RETURN_ERROR( + s_tensor->sizes()->Get(i) == expected_layout.sizes()[i], + InvalidExternalData, + "Sizes mismatch. Expected %d, got %d for size at index %d.", + s_tensor->sizes()->Get(i), + expected_layout.sizes()[i], + i); + ET_CHECK_OR_RETURN_ERROR( + s_tensor->dim_order()->Get(i) == expected_layout.dim_order()[i], + InvalidExternalData, + "Dim order mismatch. Expected %d, got %d for dim at index %d.", + s_tensor->dim_order()->Get(i), + expected_layout.dim_order()[i], + i); + } + return Error::Ok; +} + +// Check if key exists in entries. If it does, return a pointer to the entry +// otherwise return a nullptr. +NamedData* get_data_by_key(const char* key, Span entries) { + for (int i = 0; i < entries.size(); i++) { + if (strcmp(key, entries[i].key) == 0) { + return &entries[i]; + } + } + return nullptr; +} + ET_NODISCARD Result getTensorDataPtr( const executorch_flatbuffer::Tensor* s_tensor, const Program* program, size_t nbytes, HierarchicalAllocator* allocator, - const NamedDataMap* named_data_map) { + const NamedDataMap* named_data_map, + Span external_constants) { auto data_buffer_idx = s_tensor->data_buffer_idx(); const executorch_flatbuffer::AllocationDetails* allocation_info = s_tensor->allocation_info(); @@ -146,76 +194,38 @@ ET_NODISCARD Result getTensorDataPtr( s_tensor->extra_tensor_info()->fully_qualified_name() != nullptr, InvalidExternalData, "Fully qualified name of external tensor is null"); - // Look up tensor in named data map. 
- Result tensor_layout_res = named_data_map->get_metadata( - s_tensor->extra_tensor_info()->fully_qualified_name()->c_str()); - if (!tensor_layout_res.ok()) { - return tensor_layout_res.error(); - } - const TensorLayout& tensor_layout = tensor_layout_res.get(); - - // Compatibility checking. - ET_CHECK_OR_RETURN_ERROR( - static_cast(s_tensor->scalar_type()) == - tensor_layout.scalar_type(), - InvalidExternalData, - "Scalar type mismatch. Expected %hhd, got %hhd.", - static_cast(s_tensor->scalar_type()), - static_cast(tensor_layout.scalar_type())); - ET_CHECK_OR_RETURN_ERROR( - nbytes == tensor_layout.nbytes(), - InvalidExternalData, - "Nbytes mismatch. Expected %zu, got %zu.", - nbytes, - tensor_layout.nbytes()); - int dim = s_tensor->sizes()->size(); - ET_CHECK_OR_RETURN_ERROR( - dim == tensor_layout.sizes().size(), - InvalidExternalData, - "Dim mismatch. Expected %d, got %zu.", - dim, - tensor_layout.sizes().size()); - for (int i = 0; i < dim; i++) { - ET_CHECK_OR_RETURN_ERROR( - s_tensor->sizes()->Get(i) == tensor_layout.sizes()[i], - InvalidExternalData, - "Sizes mismatch. Expected %d, got %d for size at index %d.", - s_tensor->sizes()->Get(i), - tensor_layout.sizes()[i], - i); - ET_CHECK_OR_RETURN_ERROR( - s_tensor->dim_order()->Get(i) == tensor_layout.dim_order()[i], - InvalidExternalData, - "Dim order mismatch. Expected %d, got %d for dim at index %d.", - s_tensor->dim_order()->Get(i), - tensor_layout.dim_order()[i], - i); - } + const char* fqn = + s_tensor->extra_tensor_info()->fully_qualified_name()->c_str(); // Constant value. if (allocation_info == nullptr) { - Result data_res = named_data_map->get_data( - s_tensor->extra_tensor_info()->fully_qualified_name()->c_str()); - if (!data_res.ok()) { - return data_res.error(); + NamedData* data = get_data_by_key(fqn, external_constants); + if (data != nullptr) { + return const_cast(data->buffer.data()); + } + // Should never reach here; these tensors are resolved in + // Method::parse_external_constants. Any errors should be caught there. + return Error::Internal; + } else { + // Mutable value. + // Look up tensor in named data map. + Result tensor_layout_res = + named_data_map->get_metadata(fqn); + if (!tensor_layout_res.ok()) { + return tensor_layout_res.error(); + } + const TensorLayout& tensor_layout = tensor_layout_res.get(); + Error err = validateTensorLayout(s_tensor, tensor_layout); + if (err != Error::Ok) { + return err; } - // The const_cast is 'ok' here because program and runtime should - // guarantee that this data is never modified. Temporary until runtime - // takes ownership of FreeableBuffers in TODO(T214294528). - return const_cast(data_res.get().data()); - } - - // Mutable value. - else { // Call load_into. 
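+      // load_data_into writes the mutable external data directly into this
+      // tensor's planned-memory buffer.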
auto planned_ptr = getMemPlannedPtr(allocation_info, nbytes, allocator); if (!planned_ptr.ok()) { return planned_ptr.error(); } - auto size = named_data_map->load_data_into( - s_tensor->extra_tensor_info()->fully_qualified_name()->c_str(), - planned_ptr.get(), - nbytes); + auto size = + named_data_map->load_data_into(fqn, planned_ptr.get(), nbytes); if (size.error() != Error::Ok) { return size.error(); } diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp index a53295470f..3a29c86700 100644 --- a/runtime/executor/tensor_parser_portable.cpp +++ b/runtime/executor/tensor_parser_portable.cpp @@ -21,6 +21,7 @@ namespace executorch { namespace runtime { namespace deserialization { +using executorch::runtime::Span; using torch::executor::ScalarType; using torch::executor::Tensor; using torch::executor::TensorImpl; @@ -29,7 +30,8 @@ Result parseTensor( const Program* program, MemoryManager* memory_manager, const executorch_flatbuffer::Tensor* s_tensor, - const NamedDataMap* named_data_map) { + const NamedDataMap* named_data_map, + Span external_constants) { EXECUTORCH_SCOPE_PROF("TensorParser::parseTensor"); auto method_allocator = memory_manager->method_allocator(); @@ -149,7 +151,8 @@ Result parseTensor( program, tensor_impl->nbytes(), memory_manager->planned_memory(), - named_data_map); + named_data_map, + external_constants); if (!data_ptr.ok()) { ET_LOG( Error, diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index ed013260a9..e02c1288e5 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -8,6 +8,7 @@ def _operator_registry_preprocessor_flags(): return select({ "DEFAULT": [], "fbsource//xplat/executorch/build/constraints:executorch-max-kernel-num-256": ["-DMAX_KERNEL_NUM=256"], + "fbsource//xplat/executorch/build/constraints:executorch-max-kernel-num-128": ["-DMAX_KERNEL_NUM=128"], "fbsource//xplat/executorch/build/constraints:executorch-max-kernel-num-64": ["-DMAX_KERNEL_NUM=64"], }) else: diff --git a/shim/xplat/executorch/build/runtime_wrapper.bzl b/shim/xplat/executorch/build/runtime_wrapper.bzl index 03bca6623f..b81aabcd83 100644 --- a/shim/xplat/executorch/build/runtime_wrapper.bzl +++ b/shim/xplat/executorch/build/runtime_wrapper.bzl @@ -171,7 +171,7 @@ def _patch_kwargs_common(kwargs): # don't pick up unexpected clients while things are still in flux. if not kwargs.pop("_is_external_target", False): for target in kwargs.get("visibility", []): - if not (target.startswith("//executorch") or target.startswith("@")): + if not (target.startswith("//executorch") or target.startswith("//pytorch/tokenizers") or target.startswith("@")): fail("Please manage all external visibility using the " + "EXECUTORCH_CLIENTS list in " + "//executorch/build/fb/clients.bzl. " + diff --git a/test/utils/targets.bzl b/test/utils/targets.bzl index b16ce2bac2..93e33daf81 100644 --- a/test/utils/targets.bzl +++ b/test/utils/targets.bzl @@ -21,6 +21,7 @@ def define_common_targets(): ], visibility = [ "//executorch/...", + "//pytorch/tokenizers/...", "@EXECUTORCH_CLIENTS", ], deps = [