diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index 9182b03d38..2c2d910da9 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-0a94bb432ed75cc2d950d81b2921363218a7e459
+27e35de6c288bffad1b4d18b393579c1d1a95547
diff --git a/.ci/docker/conda-env-ci.txt b/.ci/docker/conda-env-ci.txt
index 8f2e65dae7..c675b3d9f6 100644
--- a/.ci/docker/conda-env-ci.txt
+++ b/.ci/docker/conda-env-ci.txt
@@ -1,4 +1,5 @@
cmake=3.22.1
ninja=1.10.2
libuv
+llvm-openmp
pkg-config
diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh
index 395f0c1767..75f999af41 100755
--- a/.ci/scripts/setup-macos.sh
+++ b/.ci/scripts/setup-macos.sh
@@ -121,6 +121,7 @@ setup_macos_env_variables
# NB: we need buck2 in all cases because cmake build also depends on calling
# buck2 atm
install_buck
+brew install libomp
install_pip_dependencies
# TODO(huydhn): Unlike our self-hosted runner, GitHub runner doesn't have access
diff --git a/.ci/scripts/test_eval_llama_mmlu.sh b/.ci/scripts/test_eval_llama_mmlu.sh
index c3c0a3d1a6..2f4cf1b3b3 100644
--- a/.ci/scripts/test_eval_llama_mmlu.sh
+++ b/.ci/scripts/test_eval_llama_mmlu.sh
@@ -43,6 +43,7 @@ run_and_verify() {
--tasks mmlu \
-f 5 \
--max_seq_length 2048 \
+ --max_context_length 2048 \
--limit 5 > result.txt
# Verify result.txt
diff --git a/.ci/scripts/test_eval_llama_wikitext.sh b/.ci/scripts/test_eval_llama_wikitext.sh
index 77af12270c..8c1713ae12 100644
--- a/.ci/scripts/test_eval_llama_wikitext.sh
+++ b/.ci/scripts/test_eval_llama_wikitext.sh
@@ -41,6 +41,7 @@ run_and_verify() {
-kv \
-d fp32 \
--max_seq_length 2048 \
+ --max_context_length 2048 \
--limit 5 > result.txt
# Verify result.txt
diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml
index 96fdfd51fe..36b679eda4 100644
--- a/.github/workflows/_android.yml
+++ b/.github/workflows/_android.yml
@@ -7,7 +7,10 @@ on:
jobs:
build-llm-demo:
name: build-llm-demo
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-clang12-android
diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml
index 74ea5ca7bc..414f86494b 100644
--- a/.github/workflows/_unittest.yml
+++ b/.github/workflows/_unittest.yml
@@ -14,7 +14,10 @@ on:
jobs:
linux:
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
with:
runner: linux.2xlarge
docker-image: ${{ inputs.docker-image }}
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 5d34bd8626..a83d374ab0 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -155,7 +155,10 @@ jobs:
export-models:
name: export-models
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
needs: set-parameters
secrets: inherit
strategy:
@@ -332,7 +335,10 @@ jobs:
build-benchmark-app:
name: build-benchmark-app
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
needs: set-parameters
with:
runner: linux.2xlarge
diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml
index a10de79363..d204e121ff 100644
--- a/.github/workflows/android-release-artifacts.yml
+++ b/.github/workflows/android-release-artifacts.yml
@@ -31,7 +31,10 @@ jobs:
build-aar:
name: build-aar
needs: check-if-aar-exists
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-clang12-android
diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml
index 8ac755bf5d..8349ddb419 100644
--- a/.github/workflows/apple.yml
+++ b/.github/workflows/apple.yml
@@ -37,7 +37,7 @@ jobs:
id: set_version
shell: bash
run: |
- VERSION="0.4.0.$(TZ='PST8PDT' date +%Y%m%d)"
+ VERSION="0.5.0.$(TZ='PST8PDT' date +%Y%m%d)"
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
build-demo-ios:
diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml
index 7a3b862b21..8d9081615b 100644
--- a/.github/workflows/doc-build.yml
+++ b/.github/workflows/doc-build.yml
@@ -15,7 +15,10 @@ on:
jobs:
build:
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
matrix:
include:
@@ -81,8 +84,9 @@ jobs:
needs: build
if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/v'))
permissions:
+ id-token: write
contents: write
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
repository: pytorch/executorch
download-artifact: docs
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 93c89355d7..aab68b3059 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -16,7 +16,10 @@ concurrency:
jobs:
lintrunner:
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-linter
@@ -62,7 +65,10 @@ jobs:
exit $RC
android-java-format:
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-linter
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index df13140ca9..6b4644bb52 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -39,7 +39,10 @@ jobs:
test-models-linux:
name: test-models-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
needs: gather-models
strategy:
matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index dbe0e872ac..16611c09f3 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -33,7 +33,10 @@ jobs:
test-setup-linux-gcc:
name: test-setup-linux-gcc
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -55,7 +58,10 @@ jobs:
test-models-linux:
name: test-models-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
needs: gather-models
strategy:
matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
@@ -82,7 +88,10 @@ jobs:
test-llama-runner-linux:
name: test-llama-runner-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
matrix:
dtype: [fp32]
@@ -121,7 +130,10 @@ jobs:
test-llama-runner-linux-android:
name: test-llama-runner-linux-android
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -141,7 +153,10 @@ jobs:
test-custom-ops-linux:
name: test-custom-ops-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -162,7 +177,10 @@ jobs:
test-selective-build-linux:
name: test-selective-build-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -183,7 +201,10 @@ jobs:
test-llava-runner-linux:
name: test-llava-runner-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -191,7 +212,7 @@ jobs:
docker-image: executorch-ubuntu-22.04-clang12
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
- timeout: 90
+ timeout: 180
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
@@ -200,7 +221,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
- bash install_executorch.sh --pybind xnnpack
+ bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit
# install Llava requirements
bash examples/models/llama/install_requirements.sh
@@ -214,7 +235,10 @@ jobs:
test-quantized-aot-lib-linux:
name: test-quantized-aot-lib-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -234,7 +258,10 @@ jobs:
test-pybind-build-linux:
name: test-pybind-build-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -260,7 +287,10 @@ jobs:
test-binary-size-linux-gcc:
name: test-binary-size-linux-gcc
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -292,7 +322,10 @@ jobs:
test-binary-size-linux:
name: test-binary-size-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -324,10 +357,16 @@ jobs:
android:
uses: ./.github/workflows/_android.yml
+ permissions:
+ id-token: write
+ contents: read
needs: test-llama-runner-linux
unittest:
uses: ./.github/workflows/_unittest.yml
+ permissions:
+ id-token: write
+ contents: read
with:
docker-image: executorch-ubuntu-22.04-clang12
@@ -365,7 +404,10 @@ jobs:
test-llama-runner-qnn-linux:
name: test-llama-runner-qnn-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
matrix:
dtype: [fp32]
@@ -400,7 +442,10 @@ jobs:
test-qnn-models-linux:
name: test-qnn-models-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -419,7 +464,10 @@ jobs:
test-phi-3-mini-runner-linux:
name: test-phi-3-mini-runner-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -436,7 +484,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
- bash install_executorch.sh --pybind xnnpack
+ bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit
# install phi-3-mini requirements
bash examples/models/phi-3-mini/install_requirements.sh
@@ -446,7 +494,10 @@ jobs:
test-eval_llama-wikitext-linux:
name: test-eval_llama-wikitext-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -463,7 +514,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
- bash install_executorch.sh --pybind xnnpack
+ bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit
# install llama requirements
bash examples/models/llama/install_requirements.sh
@@ -473,7 +524,10 @@ jobs:
test-eval_llama-mmlu-linux:
name: test-eval_llama-mmlu-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -481,7 +535,7 @@ jobs:
docker-image: executorch-ubuntu-22.04-clang12
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
- timeout: 90
+ timeout: 180
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
@@ -490,7 +544,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
- bash install_executorch.sh --pybind xnnpack
+ bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit
# install llama requirements
bash examples/models/llama/install_requirements.sh
@@ -500,7 +554,10 @@ jobs:
test-llama_runner_eager-linux:
name: test-llama_runner_eager-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
@@ -517,7 +574,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# install pybind
- bash install_executorch.sh --pybind xnnpack
+ bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit
# install llama requirements
bash examples/models/llama/install_requirements.sh
@@ -527,7 +584,10 @@ jobs:
test-mediatek-models-linux:
name: test-mediatek-models-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
fail-fast: false
with:
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 0cbbe6f643..04a6c96f3e 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -107,7 +107,10 @@ jobs:
test-demo-backend-delegation:
name: test-demo-backend-delegation
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
matrix:
include:
@@ -147,7 +150,7 @@ jobs:
conda activate "${CONDA_ENV}"
source .ci/scripts/utils.sh
- install_executorch
+ install_executorch "use-pt-pinned-commit"
.ci/scripts/setup-arm-baremetal-tools.sh
@@ -177,7 +180,7 @@ jobs:
conda activate "${CONDA_ENV}"
source .ci/scripts/utils.sh
- install_executorch
+ install_executorch "use-pt-pinned-commit"
.ci/scripts/setup-arm-baremetal-tools.sh
@@ -301,7 +304,10 @@ jobs:
test-qnn-model:
name: test-qnn-model
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
matrix:
dtype: [fp32]
@@ -361,7 +367,10 @@ jobs:
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
name: test-huggingface-transformers
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
secrets: inherit
strategy:
matrix:
@@ -445,7 +454,10 @@ jobs:
test-llama-runner-qnn-linux:
name: test-llama-runner-qnn-linux
- uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ permissions:
+ id-token: write
+ contents: read
strategy:
matrix:
dtype: [fp32]
diff --git a/.lintrunner.toml b/.lintrunner.toml
index dd75ea8f32..093f9cdbcb 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -1,4 +1,4 @@
-merge_base_with = "origin/main"
+merge_base_with = "main"
[[linter]]
code = 'FLAKE8'
@@ -291,6 +291,7 @@ code = 'MYPY'
include_patterns = [
# TODO(https://github.com/pytorch/executorch/issues/7441): Gradually start enabling all folders.
# 'backends/**/*.py',
+ 'backends/arm/**/*.py',
'build/**/*.py',
'codegen/**/*.py',
# 'devtools/**/*.py',
@@ -312,6 +313,7 @@ exclude_patterns = [
'**/third-party/**',
'scripts/check_binary_dependencies.py',
'profiler/test/test_profiler_e2e.py',
+ 'backends/arm/test/**',
]
command = [
'python',
diff --git a/.mypy.ini b/.mypy.ini
index 43d75e64de..8c1c9dbcad 100644
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -77,6 +77,9 @@ ignore_missing_imports = True
[mypy-ruamel]
ignore_missing_imports = True
+[mypy-serializer.*]
+ignore_missing_imports = True
+
[mypy-setuptools.*]
ignore_missing_imports = True
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c1b8e2ec7..ca8d1bbbcf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -239,6 +240,13 @@ cmake_dependent_option(
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
)
+
+if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
+ set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
+ set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
+ set(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
+endif()
+
if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON)
@@ -790,6 +798,35 @@ if(EXECUTORCH_BUILD_PYBIND)
install(TARGETS portable_lib
LIBRARY DESTINATION executorch/extension/pybindings
)
+
+ if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
+
+ set(_pybind_training_dep_libs
+ ${TORCH_PYTHON_LIBRARY}
+ etdump
+ executorch
+ util
+ torch
+ extension_training
+ )
+
+ if(EXECUTORCH_BUILD_XNNPACK)
+      # need to explicitly specify XNNPACK and microkernels-prod here,
+      # otherwise the XNNPACK and microkernels-prod symbols from libtorch_cpu are used
+ list(APPEND _pybind_training_dep_libs xnnpack_backend XNNPACK microkernels-prod)
+ endif()
+
+ # pybind training
+ pybind11_add_module(_training_lib SHARED extension/training/pybindings/_training_lib.cpp)
+
+ target_include_directories(_training_lib PRIVATE ${TORCH_INCLUDE_DIRS})
+ target_compile_options(_training_lib PUBLIC ${_pybind_compile_options})
+ target_link_libraries(_training_lib PRIVATE ${_pybind_training_dep_libs})
+
+ install(TARGETS _training_lib
+ LIBRARY DESTINATION executorch/extension/training/pybindings
+ )
+ endif()
endif()
if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
@@ -819,6 +856,14 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
list(APPEND _executor_runner_libs quantized_ops_lib)
endif()
+ if(EXECUTORCH_ENABLE_EVENT_TRACER)
+ if(EXECUTORCH_BUILD_DEVTOOLS)
+ list(APPEND _executor_runner_libs etdump flatccrt)
+ else()
+ message(SEND_ERROR "Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled.")
+ endif()
+ endif()
+
add_executable(executor_runner ${_executor_runner__srcs})
if(CMAKE_BUILD_TYPE STREQUAL "Release")
if(APPLE)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index bd943c587b..88f55ef73c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,6 +44,38 @@ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
+### Issue Labels
+
+#### Module/Partner Labels
+
+[Labels beginning with `module:`](https://github.com/pytorch/executorch/labels?q=%22module%3A+%22)
+indicate the area that the issue relates to. The ExecuTorch oncall will
+typically add this label.
+
+[Labels beginning with `partner:`](https://github.com/pytorch/executorch/labels?q=%22partner%3A+%22)
+indicate the ExecuTorch partner who owns the issue. The ExecuTorch oncall will
+typically add this label.
+
+#### Lifecycle Labels
+
+The ExecuTorch oncall will triage new issues. If the issue requires more
+information from the issue's author, the oncall will add the `need-user-input` label
+and wait for the author to respond.
+
+Once the issue contains enough information, the oncall will:
+- Ensure that the title is descriptive
+- Add one of the labels:
+ - `bug`: The issue describes an unexpected problem
+ - `feature`: The issue describes a request for new functionality
+ - `rfc`: The issue describes a proposed change to functionality
+- Add one `module:` label or one `partner:` label, as described above
+- Add the `triaged` label
+
+After this point, the oncall has finished the triage process, and the
+module owner or partner is responsible for resolving the issue. (See
+https://github.com/pytorch/executorch/issues/7679 for the mapping of labels to
+owners.)
+
### Claiming Issues
We'd love your help closing out [open
issues](https://github.com/pytorch/executorch/issues?q=sort%3Aupdated-desc+is%3Aissue+is%3Aopen)
diff --git a/README-wheel.md b/README-wheel.md
index e04e6dfa6d..9f074ab5ee 100644
--- a/README-wheel.md
+++ b/README-wheel.md
@@ -4,20 +4,21 @@ standard on-device iOS and Android mobile deployments. One of the main goals for
ExecuTorch is to enable wider customization and deployment capabilities of the
PyTorch programs.
-The `executorch` pip package is in alpha.
-* Supported python versions: 3.10, 3.11
+The `executorch` pip package is in beta.
+* Supported python versions: 3.10, 3.11, 3.12
* Compatible systems: Linux x86_64, macOS aarch64
-The prebuilt `executorch.extension.pybindings.portable_lib` module included in
-this package provides a way to run ExecuTorch `.pte` files, with some
-restrictions:
+The prebuilt `executorch.runtime` module included in this package provides a way
+to run ExecuTorch `.pte` files, with some restrictions:
* Only [core ATen
operators](https://pytorch.org/executorch/stable/ir-ops-set-definition.html)
are linked into the prebuilt module
* Only the [XNNPACK backend
delegate](https://pytorch.org/executorch/main/native-delegates-executorch-xnnpack-delegate.html)
- is linked into the prebuilt module
-* [macOS only] [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html) and [MPS](https://pytorch.org/executorch/main/build-run-mps.html) backend delegates are linked into the prebuilt module.
+ is linked into the prebuilt module.
+* \[macOS only] [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html)
+ and [MPS](https://pytorch.org/executorch/main/build-run-mps.html) backend
+ delegates are also linked into the prebuilt module.
Please visit the [ExecuTorch website](https://pytorch.org/executorch/) for
tutorials and documentation. Here are some starting points:
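The hunk above points users at the prebuilt `executorch.runtime` module for running `.pte` files. A minimal sketch of that flow, assuming the `Runtime.get()` / `load_program` / `load_method` API; the model path and input shape are placeholders:

```python
import torch
from executorch.runtime import Runtime

# Assumed usage of the prebuilt executorch.runtime module; "model.pte" and the
# input tensor are placeholders for a real exported program and its inputs.
runtime = Runtime.get()
program = runtime.load_program("model.pte")
method = program.load_method("forward")
outputs = method.execute([torch.randn(1, 3, 224, 224)])
print(outputs)
```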
diff --git a/README.md b/README.md
index aded66bf40..3a2a833e05 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,37 @@
-# ExecuTorch
-
-**ExecuTorch** is an end-to-end solution for enabling on-device inference
-capabilities across mobile and edge devices including wearables, embedded
-devices and microcontrollers. It is part of the PyTorch Edge ecosystem and
-enables efficient deployment of PyTorch models to edge devices.
+
+
+# ExecuTorch: A powerful on-device AI Framework
+
+**ExecuTorch** is an end-to-end solution for on-device inference and training. It powers many of Meta's on-device AI experiences across Facebook, Instagram, Meta Quest, Ray-Ban Meta Smart Glasses, WhatsApp, and more.
+
+It supports a wide range of models including LLMs (Large Language Models), CV (Computer Vision), ASR (Automatic Speech Recognition), and TTS (Text to Speech).
+
+Platform Support:
+- Operating Systems:
+ - iOS
+ - Mac
+ - Android
+ - Linux
+ - Microcontrollers
+
+- Hardware Acceleration:
+ - Apple
+ - Arm
+ - Cadence
+ - MediaTek
+ - Qualcomm
+ - Vulkan
+ - XNNPACK
Key value propositions of ExecuTorch are:
@@ -17,35 +45,21 @@ Key value propositions of ExecuTorch are:
experience due to a lightweight runtime and utilizing full hardware
capabilities such as CPUs, NPUs, and DSPs.
-For a comprehensive technical overview of ExecuTorch and step-by-step tutorials,
-please visit our documentation website [for the latest release](https://pytorch.org/executorch/stable/index.html) (or the [main branch](https://pytorch.org/executorch/main/index.html)).
-
-Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin.
-
-Check out the examples of [Llama](./examples/models/llama/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.
+## Getting Started
+To get started, you can:
+- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index.html) on getting things running locally and deploying a model to a device
+- Use this [Colab Notebook](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away
+- Jump straight into LLM use cases by following the specific instructions for [Llama](./examples/models/llama/README.md) and [Llava](./examples/models/llava/README.md)
-**[UPDATE - 10/24]** We have added support for running [Llama 3.2 Quantized 1B/3B](./examples/models/llama/README.md) models via ExecuTorch.
-
-## Feedback
+## Feedback and Engagement
We welcome any feedback, suggestions, and bug reports from the community to help
-us improve our technology. Please use the [PyTorch
-Forums](https://discuss.pytorch.org/c/executorch) for discussion and feedback
-about ExecuTorch using the **ExecuTorch** category, and our [GitHub
-repository](https://github.com/pytorch/executorch/issues) for bug reporting.
-
-We recommend using the latest release tag from the
-[Releases](https://github.com/pytorch/executorch/releases) page when developing.
+us improve our technology. Check out the [Discussion Board](https://github.com/pytorch/executorch/discussions) or chat with us in real time on [Discord](https://discord.gg/MeacgB7A).
## Contributing
-See [CONTRIBUTING.md](CONTRIBUTING.md) for details about issues, PRs, code
-style, CI jobs, and other development topics.
-
-To connect with us and other community members, we invite you to join PyTorch Slack community by filling out this [form](https://docs.google.com/forms/d/e/1FAIpQLSeADnUNW36fjKjYzyHDOzEB_abKQE9b6gqqW9NXse6O0MWh0A/viewform). Once you've joined, you can:
-* Head to the `#executorch-general` channel for general questions, discussion, and community support.
-* Join the `#executorch-contributors` channel if you're interested in contributing directly to project development.
+We welcome contributions. To get started, review the [guidelines](CONTRIBUTING.md) and chat with us on [Discord](https://discord.gg/MeacgB7A).
## Directory Structure
diff --git a/backends/arm/README.md b/backends/arm/README.md
index 2079e8ddd8..e28559fb90 100644
--- a/backends/arm/README.md
+++ b/backends/arm/README.md
@@ -122,6 +122,18 @@ The you can run the tests with
pytest -c /dev/null -v -n auto backends/arm/test --arm_run_corstoneFVP
```
+## Passes
+
+With the default passes in the Arm Ethos-U backend, assuming the model lowers fully to the
+Ethos-U, the exported program is composed of a Quantize node, an Ethos-U custom delegate
+and a Dequantize node. In some circumstances, you may want to feed quantized input to the
+neural network straight away, e.g. if you have a camera sensor outputting (u)int8 data and
+want to keep all the arithmetic of the application in the int8 domain. For these cases, you
+can apply `exir/passes/quantize_io_pass.py`. See the unit test in
+`executorch/backends/arm/test/passes/test_ioquantization_pass.py` for an example of how to
+feed quantized inputs and obtain quantized outputs.
+
+
### Code coverage
To get code coverage:
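A hedged sketch of applying the IO quantization pass mentioned in the new Passes section; the `QuantizeInputs`/`QuantizeOutputs` names and constructor arguments are assumptions based on `exir/passes/quantize_io_pass.py`, and the referenced unit test remains the authoritative example:

```python
# Sketch only: after lowering to the Ethos-U delegate, strip the inserted
# Quantize/Dequantize nodes so the program takes int8 inputs and returns int8
# outputs directly. Class names and argument order are assumptions.
from executorch.exir.passes.quantize_io_pass import QuantizeInputs, QuantizeOutputs

def quantize_io(edge_program_manager, input_idxs=(0,), output_idxs=(0,)):
    edge_program_manager = edge_program_manager.transform(
        [
            QuantizeInputs(edge_program_manager, list(input_idxs)),
            QuantizeOutputs(edge_program_manager, list(output_idxs)),
        ]
    )
    return edge_program_manager.to_executorch()
```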
diff --git a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
index a3d168fb87..ce15d8298c 100644
--- a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
+++ b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
@@ -116,7 +116,7 @@ def insert_input_transpose(node, input_node, graph_module):
with graph_module.graph.inserting_before(node):
permute_node = create_node(
graph_module.graph,
- torch.ops.passthrough_to_tosa._transpose,
+ torch.ops.passthrough_to_tosa._transpose.default,
args=(
input_node,
list(AnnotateChannelsLastDimOrder.NHWC_inverse_order),
@@ -129,18 +129,22 @@ def insert_input_transpose(node, input_node, graph_module):
permute_node.meta["tosa_dim_order"] = tuple(
range(len(input_node.meta["val"].size()))
)
+ permute_node.meta["val"] = input_node.meta["val"]
@staticmethod
def insert_output_transpose(node, graph_module):
with graph_module.graph.inserting_after(node):
permute_node = create_node(
graph_module.graph,
- torch.ops.passthrough_to_tosa._transpose,
+ torch.ops.passthrough_to_tosa._transpose.default,
args=(node, list(AnnotateChannelsLastDimOrder.NHWC_order)),
)
permute_node.meta["tosa_dim_order"] = (
AnnotateChannelsLastDimOrder.NHWC_order
)
+ permute_node.meta["val"] = node.meta["val"].permute(
+ AnnotateChannelsLastDimOrder.NHWC_order
+ )
node.meta["tosa_dim_order"] = (0, 1, 2, 3)
users = [user for user in node.users if user != permute_node]
for user in users:
@@ -209,7 +213,7 @@ def call(self, graph_module: torch.fx.GraphModule):
# dim_order = (2, 3, 0, 1) (https://www.mlplatform.org/tosa/tosa_spec.html#_depthwise_conv2d).
dim_order = self.HWCM_order
else:
- dim_order = tuple(range(node_data.dim()))
+ dim_order = tuple(range(node_data.dim())) # type: ignore[assignment]
node.meta["tosa_dim_order"] = dim_order
# Take care of cases when:
# 4D (NHWC) -> >4D (NCH)
diff --git a/backends/arm/_passes/annotate_decomposed_matmul.py b/backends/arm/_passes/annotate_decomposed_matmul.py
index 0846d97372..3feb0a0e05 100644
--- a/backends/arm/_passes/annotate_decomposed_matmul.py
+++ b/backends/arm/_passes/annotate_decomposed_matmul.py
@@ -6,9 +6,12 @@
import itertools
+from typing import List
+
import torch
from executorch.backends.arm._passes.arm_pass_utils import create_node
-from executorch.backends.arm.tosa_quant_utils import dq_op, q_op
+
+from executorch.backends.arm.tosa_quant_utils import dq_op, q_op, QuantArgs
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_base import ExportPass, PassResult
from torch.fx import GraphModule
@@ -24,6 +27,22 @@ class AnnotateDecomposedMatmulPass(ExportPass):
matmul-op (can be mm or bmm).
"""
+ def _match_partition_to_node(
+ self, node: torch.fx.Node, partitioned_inputs: List[torch.fx.Node]
+ ) -> torch.fx.Node:
+ """
+        The order of partition.input_nodes is not guaranteed. Compare them
+        with the incoming matmul node inputs and return the corresponding
+        partition input node.
+ """
+ if not node or node in partitioned_inputs or node.op == "placeholder":
+ return node
+ else:
+ return self._match_partition_to_node(
+ node.all_input_nodes[0], partitioned_inputs
+ )
+ raise RuntimeError(f"Cannot find an input node which matches, {node}.")
+
def call(self, graph_module: GraphModule) -> PassResult:
matmul_partitions = get_source_partitions(
graph_module.graph,
@@ -45,28 +64,36 @@ def call(self, graph_module: GraphModule) -> PassResult:
matmul_node = [
node for node in partition.nodes if node.target in matmul_targets
][0]
+
if quantized_input:
matmul_args = matmul_node.all_input_nodes
- for i in range(len(matmul_args)):
- input_node = partition.input_nodes[i]
- matmul_input_node = matmul_args[i]
+ for node in matmul_args:
+ input_node = self._match_partition_to_node(
+ node, partition.input_nodes
+ )
+
# Remove partition input dq-node
input_node.replace_all_uses_with(input_node.all_input_nodes[0])
graph_module.graph.erase_node(input_node)
- input_node_qargs = input_node.args[1:]
+ input_node_qargs = QuantArgs.from_operator(
+ input_node.target, input_node.args
+ )
+
with graph_module.graph.inserting_before(matmul_node):
# Create new dq-node before matmul
dq_node = create_node(
graph=graph_module.graph,
op_target=dq_op,
)
- dq_node.args = (matmul_input_node, *input_node_qargs)
- matmul_node.replace_input_with(matmul_input_node, dq_node)
+ dq_node.args = (node, *input_node_qargs)
+ matmul_node.replace_input_with(node, dq_node)
partition_output = list(partition.output_nodes[0].users)[0]
quantized_output = partition_output.target == q_op
if quantized_output:
- output_node_qargs = partition_output.args[1:]
+ output_node_qargs = QuantArgs.from_operator(
+ partition_output.target, partition_output.args
+ )
with graph_module.graph.inserting_after(matmul_node):
# Create q-node after matmul
q_node = create_node(
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index 9bac3b037c..686bfbcd8a 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -21,26 +21,32 @@
from executorch.backends.arm._passes.convert_split_to_slice import (
ConvertSplitToSlicePass,
)
-from executorch.backends.arm._passes.convert_squeezes_to_view import (
+from executorch.backends.arm._passes.convert_squeezes_to_view import ( # type: ignore[import-not-found]
ConvertSqueezesToViewPass,
)
+from executorch.backends.arm._passes.decompose_batchnorm_pass import (
+ DecomposeBatchNormPass,
+)
from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass
from executorch.backends.arm._passes.decompose_layernorm_pass import (
DecomposeLayerNormPass,
)
from executorch.backends.arm._passes.decompose_linear_pass import DecomposeLinearPass
from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass
-from executorch.backends.arm._passes.decompose_select import DecomposeSelectPass
+from executorch.backends.arm._passes.decompose_select import ( # type: ignore[import-not-found]
+ DecomposeSelectPass,
+)
from executorch.backends.arm._passes.decompose_softmaxes_pass import (
DecomposeSoftmaxesPass,
)
from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
FoldAndAnnotateQParamsPass,
- QuantizeFullArgument,
+ QuantizeOperatorArguments,
RetraceFoldedDtypesPass,
)
-from executorch.backends.arm._passes.fuse_quantized_activation_pass import (
+from executorch.backends.arm._passes.fuse_batchnorm2d_pass import FuseBatchnorm2DPass
+from executorch.backends.arm._passes.fuse_quantized_activation_pass import ( # type: ignore[import-not-found]
FuseQuantizedActivationPass,
)
from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
@@ -48,10 +54,12 @@
KeepDimsFalseToSqueezePass,
)
from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
-from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
+from executorch.backends.arm._passes.meandim_to_averagepool_pass import ( # type: ignore[attr-defined]
ConvertMeanDimToAveragePoolPass,
)
-from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass
+from executorch.backends.arm._passes.mm_to_bmm_pass import ( # type: ignore[import-not-found]
+ ConvertMmToBmmPass,
+)
from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass
from executorch.backends.arm._passes.scalars_to_attribute_pass import (
ScalarsToAttributePass,
@@ -82,14 +90,15 @@ def _transform(self, graph_module: GraphModule):
def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
self.add_pass(FuseQuantizedActivationPass())
self.add_pass(RemoveGetItemPass())
+ self.add_pass(DecomposeBatchNormPass())
self.add_pass(ConvertSplitToSlicePass())
self.add_pass(ConvertMmToBmmPass())
self.add_pass(DecomposeLinearPass())
self.add_pass(ConvertMeanDimToAveragePoolPass())
self.add_pass(AnnotateDecomposedMatmulPass())
- self.add_pass(QuantizeFullArgument())
- self.add_pass(FoldAndAnnotateQParamsPass())
+ self.add_pass(QuantizeOperatorArguments())
+ self.add_pass(FoldAndAnnotateQParamsPass()) # type: ignore[call-arg]
self.add_pass(RetraceFoldedDtypesPass())
self.add_pass(InsertTableOpsPass(exported_program))
@@ -116,16 +125,18 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
self.add_pass(ConvertSplitToSlicePass())
self.add_pass(ConvertMmToBmmPass())
self.add_pass(DecomposeLinearPass())
+ self.add_pass(DecomposeBatchNormPass())
self.add_pass(DecomposeLayerNormPass())
self.add_pass(DecomposeVarPass())
self.add_pass(DecomposeMeanDimPass())
self.add_pass(ConvertMeanDimToAveragePoolPass())
self.add_pass(DecomposeDivPass())
self.add_pass(DecomposeSoftmaxesPass())
+ self.add_pass(FuseBatchnorm2DPass(exported_program))
self.add_pass(AnnotateDecomposedMatmulPass())
- self.add_pass(QuantizeFullArgument())
- self.add_pass(FoldAndAnnotateQParamsPass())
+ self.add_pass(QuantizeOperatorArguments())
+ self.add_pass(FoldAndAnnotateQParamsPass()) # type: ignore[call-arg]
self.add_pass(RetraceFoldedDtypesPass())
self.add_pass(InsertTableOpsPass(exported_program))
diff --git a/backends/arm/_passes/arm_pass_utils.py b/backends/arm/_passes/arm_pass_utils.py
index 7377d401ab..cb43acc7fd 100644
--- a/backends/arm/_passes/arm_pass_utils.py
+++ b/backends/arm/_passes/arm_pass_utils.py
@@ -1,5 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -58,9 +58,9 @@ def get_param_tensor(
elif is_get_attr_node(node):
# This is a hack to support both lifted and unlifted graph
try:
- return getattr(node.graph.owning_module, node.target)
+ return getattr(node.graph.owning_module, node.target) # type: ignore[arg-type]
except AttributeError:
- return getattr(exp_prog.graph_module, node.target)
+ return getattr(exp_prog.graph_module, node.target) # type: ignore[arg-type]
raise RuntimeError(f"unsupported param type, {node.op}.")
@@ -156,7 +156,7 @@ def get_node_arg(args: list | dict, key: int | str | type, default_value=None):
f"Out of bounds index {key} for getting value in args (of size {len(args)})"
)
elif isinstance(key, str):
- return args.get(key, default_value) # pyre-ignore[16]
+ return args.get(key, default_value) # type: ignore[union-attr] # pyre-ignore[16]
elif isclass(key):
for arg in args:
if isinstance(arg, key):
diff --git a/backends/arm/_passes/decompose_batchnorm_pass.py b/backends/arm/_passes/decompose_batchnorm_pass.py
new file mode 100644
index 0000000000..d33e8e3b51
--- /dev/null
+++ b/backends/arm/_passes/decompose_batchnorm_pass.py
@@ -0,0 +1,138 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+import operator
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+edge_bn_ops = (exir_ops.edge.aten._native_batch_norm_legit_no_training.default,)
+
+
+def get_bn_decomposition(op) -> tuple:
+ """
+ Returns decomposition of batchnorm in edge ops.
+ Raises RuntimeError if op is not batchnorm edge op.
+ """
+ if op in edge_bn_ops:
+ return (
+ exir_ops.edge.aten.sub.Tensor,
+ exir_ops.edge.aten.add.Tensor,
+ exir_ops.edge.aten.rsqrt.default,
+ exir_ops.edge.aten.mul.Tensor,
+ exir_ops.edge.aten.view_copy.default,
+ exir_ops.edge.aten.full.default,
+ )
+ else:
+ raise RuntimeError(f"Can't get decomposition for {op}")
+
+
+class DecomposeBatchNormPass(ExportPass):
+ """
+ Decompose BatchNorm to:
+ %output = (%x - %E[x]) / SQRT( %Var[x] + %epsilon ) * %gamma + %beta
+ e.g.
+ %output = (%activations - %running_mean) / SQRT( %running_var + %epsilon_const ) * %weights + %bias
+ ->
+ %op1 = sub(%activations, %running_mean)
+ %op2 = add(%running_var, %epsilon_const)
+ %op3 = rsqrt(%op2)
+ %op4 = mul(%op1, %op3)
+ %op5 = mul(%op4, %weights)
+ %output = add(%op5, %bias)
+ """
+
+ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+ modified = False
+ for node in graph_module.graph.nodes:
+ if node.op != "call_function" or node.target not in edge_bn_ops:
+ continue
+
+ args = node.args
+ meta = node.meta
+ (
+ activations,
+ weights,
+ bias,
+ running_mean,
+ running_var,
+ momentum,
+ epsilon,
+ ) = args
+ if momentum != 0.1:
+                raise RuntimeError(f"Expected momentum=0.1 but got {momentum}")
+
+ shape = meta["val"][0].size()
+ dtype = meta["val"][0].dtype
+ rank = len(shape)
+ running_mean_shape = running_mean.meta["val"].shape
+ running_mean_reshaped_shape = [1] * rank
+ running_mean_reshaped_shape[1] = running_mean_shape[0]
+ epsilon_reshaped_shape = [1] * rank
+
+ sub, add, rsqrt, mul, view, full = get_bn_decomposition(node.target)
+ with graph_module.graph.inserting_before(node):
+ mean_reshaped = create_node(
+ graph_module.graph,
+ view,
+ args=(running_mean, running_mean_reshaped_shape),
+ )
+ op1 = create_node(
+ graph_module.graph, sub, args=(activations, mean_reshaped)
+ )
+ full = create_node(
+ graph_module.graph,
+ full,
+ args=(epsilon_reshaped_shape, epsilon),
+ kwargs={"dtype": dtype},
+ )
+ var_reshaped = create_node(
+ graph_module.graph,
+ view,
+ args=(running_var, running_mean_reshaped_shape),
+ )
+ op2 = create_node(graph_module.graph, add, args=(var_reshaped, full))
+ op3 = create_node(graph_module.graph, rsqrt, args=(op2,))
+ op4 = create_node(graph_module.graph, mul, args=(op1, op3))
+ if weights is not None:
+ weights_reshaped = create_node(
+ graph_module.graph,
+ view,
+ args=(weights, running_mean_reshaped_shape),
+ )
+ op5 = create_node(
+ graph_module.graph, mul, args=(op4, weights_reshaped)
+ )
+ else:
+ op5 = op4
+ output = op5
+ if bias is not None:
+ bias_reshaped_shape = running_mean_reshaped_shape
+ bias_reshaped = create_node(
+ graph_module.graph, view, args=(bias, bias_reshaped_shape)
+ )
+ output = create_node(
+ graph_module.graph, add, args=(op5, bias_reshaped)
+ )
+
+ users = [user for user in node.users if node != user]
+ node.replace_all_uses_with(output)
+ for user in users:
+ if user.target == operator.getitem:
+ user.replace_all_uses_with(output)
+ graph_module.graph.erase_node(node)
+ graph_module.graph.eliminate_dead_code()
+ modified = True
+ if modified:
+ graph_module.recompile()
+ graph_module = super().call(graph_module).graph_module
+
+ return PassResult(graph_module, modified)
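The new `DecomposeBatchNormPass` rewrites inference-mode batch norm into view/sub/add/rsqrt/mul/full edge ops. A small plain-PyTorch check (not part of the pass) that the formula in the docstring reproduces `F.batch_norm` with running statistics:

```python
import torch

# Verify: out = (x - running_mean) * rsqrt(running_var + eps) * weight + bias
# matches torch's inference-mode batch norm, with per-channel stats broadcast
# over a (N, C, H, W) input, mirroring running_mean_reshaped_shape in the pass.
torch.manual_seed(0)
x = torch.randn(2, 4, 8, 8)
weight, bias = torch.randn(4), torch.randn(4)
running_mean, running_var = torch.randn(4), torch.rand(4) + 0.5
eps = 1e-5

reference = torch.nn.functional.batch_norm(
    x, running_mean, running_var, weight, bias, training=False, eps=eps
)

def per_channel(t):
    # Reshape (C,) stats to (1, C, 1, 1) so they broadcast over the (N, C, H, W) input.
    return t.view(1, -1, 1, 1)

decomposed = (x - per_channel(running_mean)) * torch.rsqrt(per_channel(running_var) + eps)
decomposed = decomposed * per_channel(weight) + per_channel(bias)

assert torch.allclose(reference, decomposed, atol=1e-5)
```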
diff --git a/backends/arm/_passes/decompose_layernorm_pass.py b/backends/arm/_passes/decompose_layernorm_pass.py
index 3739337101..cc4a81caae 100644
--- a/backends/arm/_passes/decompose_layernorm_pass.py
+++ b/backends/arm/_passes/decompose_layernorm_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -82,9 +82,10 @@ def call(self, graph_module: torch.fx.GraphModule):
n_dims = len(normalized_shape)
if isinstance(meta["val"], tuple):
shape = meta["val"][0].size()
+ dtype = meta["val"][0].dtype
else:
shape = meta["val"].size()
- dtype = meta["val"][0].dtype
+ dtype = meta["val"].dtype
rank = len(shape)
dims = list(range(-1, -1 * (n_dims + 1), -1))
dims = [dim % rank for dim in dims]
diff --git a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py
index b1e680b7bc..29791940d5 100644
--- a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py
+++ b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py
@@ -105,21 +105,6 @@ def fold_and_annotate_arg(
for arg in arg_list:
if not isinstance(arg, Node):
return
- """
- Make sure arg has requires_grad set to False
- For parameters that are not quantized, sometimes (i.e. convolution)
- the Parameter(FakeTensor(...)) has requires_grad set to True, which
- causes the retracing of the graph to fail with:
-
- E RuntimeError: isDifferentiableType(variable.scalar_type()) INTERNAL ASSERT FAILED at "/Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/functions/utils.h":74, please report a bug to PyTorch.
- E
- E While executing %aten_convolution_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%quantized_decomposed_quantize_per_tensor_default, %b__frozen_param0, %p__param_constant1, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
- E Original traceback:
- E File "/Users/perast01/src/executorch/backends/arm/test/ops/test_conv2d.py", line 110, in forward
- E x = conv(x)
- """
- if arg.op == "placeholder":
- arg.meta["val"].requires_grad = False
arg_quant_params = None
if arg.target == dq_op:
@@ -134,7 +119,7 @@ def fold_and_annotate_arg(
node.meta["input_qparams"][i] = input_qparams
for n in nodes_to_remove:
assert n.target == dq_op
- n.replace_all_uses_with(n.args[0])
+ n.replace_all_uses_with(n.args[0]) # type: ignore[arg-type]
graph_module.graph.erase_node(n)
def call(self, graph_module: GraphModule) -> PassResult:
@@ -182,11 +167,14 @@ def call(self, graph_module: GraphModule) -> PassResult:
return PassResult(graph_module, True)
-class QuantizeFullArgument(ExportPass):
+class QuantizeOperatorArguments(ExportPass):
"""
- Make sure the fill_value for full.default is quantized. This pass needs to be run before
- the folding pass above to make sure that the retraced output of the full.default op is
- the right dtype.
+ This pass makes sure that the arguments to full.default and clamp.default are quantized correctly.
+ More specifically, this pass:
+ - Makes sure the fill_value for full.default is quantized. This pass needs to be run before
+ the folding pass above to make sure that the retraced output of the full.default op is
+ the right dtype.
+ - Makes sure the min and max values to clamp.default are quantized, if it's a quantized operator.
"""
def call(self, graph_module: GraphModule) -> PassResult:
@@ -194,7 +182,10 @@ def call(self, graph_module: GraphModule) -> PassResult:
# Loop over the graph nodes and find full.default nodes.
for n in graph_module.graph.nodes:
n = cast(Node, n)
- if n.target != exir_ops.edge.aten.full.default:
+ if n.target not in {
+ exir_ops.edge.aten.clamp.default,
+ exir_ops.edge.aten.full.default,
+ }:
continue
# Make sure we have a quantized operator
@@ -203,13 +194,29 @@ def call(self, graph_module: GraphModule) -> PassResult:
continue
qargs = QuantArgs.from_operator(user.target, user.args)
- if "dtype" not in n.kwargs.keys() or n.kwargs["dtype"] != qargs.dtype:
- # replace the node arg with a quantized dito and also set dtype
- # to get the right output according to the Edge IR specification:
- # exir/dialects/edge/edge.yaml:3596
- quantized_full_value = qargs.quantize_value(n.args[1]).item()
- n.update_arg(1, quantized_full_value)
- n.update_kwarg("dtype", qargs.dtype)
+
+ if n.target == exir_ops.edge.aten.full.default:
+ if "dtype" not in n.kwargs.keys() or n.kwargs["dtype"] != qargs.dtype:
+                    # replace the node arg with a quantized ditto and also set dtype
+ # to get the right output according to the Edge IR specification:
+ # exir/dialects/edge/edge.yaml:3596
+ quantized_full_value = qargs.quantize_value(n.args[1]).item()
+ n.update_arg(1, quantized_full_value)
+ n.update_kwarg("dtype", qargs.dtype)
+ modified = True
+ elif n.target == exir_ops.edge.aten.clamp.default:
+ # Quantize the min and max arguments of clamp, if they are not None
+ min_val = n.args[1]
+ max_val = None if len(n.args) <= 2 else n.args[2]
+
+ if min_val is not None:
+ quantized_min_val = qargs.quantize_value(min_val).item()
+ n.update_arg(1, quantized_min_val)
+
+ if max_val is not None:
+ quantized_max_val = qargs.quantize_value(max_val).item()
+ n.update_arg(2, quantized_max_val)
+
modified = True
return PassResult(graph_module, modified)
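`QuantizeOperatorArguments` now also quantizes the scalar `min`/`max` arguments of `clamp.default` using the q-params of the consuming quantize node. An illustrative stand-in for what `QuantArgs.quantize_value` is assumed to do with such a scalar (the real helper lives in `tosa_quant_utils`):

```python
import torch

# Assumed affine quantization of a scalar operator argument; scale, zero_point,
# qmin, qmax and dtype would come from the q-node consuming the operator output.
def quantize_scalar(value, scale, zero_point, qmin, qmax, dtype):
    q = torch.round(torch.tensor(value) / scale) + zero_point
    return torch.clamp(q, qmin, qmax).to(dtype)

# Example: clamp(min=0.0, max=6.0) with scale=0.02, zero_point=-128 becomes
# clamp(min=-128, max=127) in the int8 domain (6.0 / 0.02 - 128 = 172 -> 127).
print(quantize_scalar(0.0, 0.02, -128, -128, 127, torch.int8))  # tensor(-128, dtype=torch.int8)
print(quantize_scalar(6.0, 0.02, -128, -128, 127, torch.int8))  # tensor(127, dtype=torch.int8)
```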
diff --git a/backends/arm/_passes/fuse_batchnorm2d_pass.py b/backends/arm/_passes/fuse_batchnorm2d_pass.py
new file mode 100644
index 0000000000..6a5ece2e44
--- /dev/null
+++ b/backends/arm/_passes/fuse_batchnorm2d_pass.py
@@ -0,0 +1,128 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir import ExportedProgram
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch._export.utils import get_buffer, get_param
+from torch.fx import Node
+from torch.nn.utils.fusion import fuse_conv_bn_weights
+
+
+class FuseBatchnorm2DPass(ExportPass):
+ """Fuses the pattern convolution -> batchnorm by updating
+ the weights and bias of the convolution and removing the batchnorm.
+ """
+
+ def __init__(self, exported_program: ExportedProgram):
+ self.exported_program = exported_program
+ super().__init__()
+
+ def is_fuseable_conv_bn(self, node: Node):
+ """Returns True if node is a batchnorm that can be fused into
+ a parent convolution."""
+ if node.op != "call_function":
+ return False
+ if node.target not in (
+ exir_ops.edge.aten._native_batch_norm_legit,
+ exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
+ ):
+ return False
+ conv = node.all_input_nodes[0]
+ if conv.target != exir_ops.edge.aten.convolution.default:
+ return False
+        # Batchnorm users are getitem; we can only handle those that get the first element.
+ for user in node.users:
+ get_index = user.args[1]
+ if get_index != 0:
+ return False
+ # Since we change the output of the conv, fuse only if it has single user.
+ if len(conv.users) > 1:
+ return False
+ # For similar reasons, only fuse if conv parameters have single user.
+ if len(conv.all_input_nodes[1].users) > 1:
+ return False
+ if len(conv.all_input_nodes) > 2 and len(conv.all_input_nodes[2].users) > 1:
+ return False
+ return True
+
+ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # noqa: C901
+ modified = False
+ for node in graph_module.graph.nodes:
+ if not self.is_fuseable_conv_bn(node):
+ continue
+
+ def get_param_or_none(arg) -> torch.nn.Parameter | None:
+                """get_param but check if arg is None first."""
+ return (
+ get_param(self.exported_program, arg) if arg is not None else None
+ )
+
+ # Get weight, bias, mean, var and epsilon from the batchnorm
+ bn = node
+ conv, bn_weight_node, bn_bias_node, bn_mean_node, bn_var_node = bn.args[0:5]
+ bn_weight = get_param_or_none(bn_weight_node)
+ bn_bias = get_param_or_none(bn_bias_node)
+
+ running_mean = get_buffer(self.exported_program, bn_mean_node)
+ running_var = get_buffer(self.exported_program, bn_var_node)
+ if running_mean is None or running_var is None:
+ raise ValueError(
+ "Parameters running_mean and running_var of batchnorm can't be None."
+ )
+ epsilon = bn.args[-1]
+
+ # Get weight and bias from conv
+ conv_weight_node, conv_bias_node = conv.args[1:3]
+ conv_weight = get_param(self.exported_program, conv_weight_node)
+ conv_bias = get_param_or_none(conv_bias_node)
+ if conv_weight is None:
+ raise ValueError("Parameter weight of convolution can't be None.")
+
+ # Compute conv parameters folded with batchnorm
+ fused_conv_weight, fused_conv_bias = fuse_conv_bn_weights(
+ conv_weight,
+ conv_bias,
+ running_mean,
+ running_var,
+ epsilon,
+ bn_weight,
+ bn_bias,
+ )
+
+ # Set the conv parameters to fused value
+ def try_set_param(
+ param_node: Node | None, param_value: torch.nn.Parameter
+ ) -> bool:
+ """set_param but check if param_node is None first. Return True if param was set successfully, otherwise False."""
+ if param_node is not None:
+ param_name = (
+ self.exported_program.graph_signature.inputs_to_parameters[
+ param_node.name
+ ]
+ )
+ self.exported_program.state_dict[param_name] = param_value
+ return True
+ return False
+
+ try_set_param(conv_weight_node, fused_conv_weight)
+ if not try_set_param(conv_bias_node, fused_conv_bias) and try_set_param(
+ bn_bias_node, fused_conv_bias
+ ):
+ # Conv didn't have bias but batchnorm did, steal bias from batchnorm.
+ conv_args = (*conv.args[0:2], bn_bias_node, *conv.args[3:])
+ conv.args = conv_args
+
+ # Erasing nodes is handled by dead-code elimination.
+ for user in bn.users:
+ user.replace_all_uses_with(conv)
+ modified = True
+
+ if modified:
+ graph_module.graph.eliminate_dead_code()
+ graph_module.recompile()
+ graph_module = super().call(graph_module).graph_module
+ return PassResult(graph_module=graph_module, modified=modified)
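The new `FuseBatchnorm2DPass` delegates the arithmetic to `torch.nn.utils.fusion.fuse_conv_bn_weights`. A quick numeric check, independent of the pass, that the folded parameters reproduce convolution followed by inference-mode batch norm:

```python
import torch
from torch.nn.utils.fusion import fuse_conv_bn_weights

# Folding batchnorm statistics into the conv weights should give the same output
# as conv -> batchnorm in eval mode.
torch.manual_seed(0)
conv = torch.nn.Conv2d(3, 8, kernel_size=3, bias=False).eval()
bn = torch.nn.BatchNorm2d(8).eval()
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 1.5)

x = torch.randn(1, 3, 16, 16)
reference = bn(conv(x))

fused_weight, fused_bias = fuse_conv_bn_weights(
    conv.weight, conv.bias, bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias
)
fused = torch.nn.functional.conv2d(x, fused_weight, fused_bias)

assert torch.allclose(reference, fused, atol=1e-5)
```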
diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py
index 57a8376d40..b500540ffb 100644
--- a/backends/arm/_passes/insert_table_ops.py
+++ b/backends/arm/_passes/insert_table_ops.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -31,7 +31,7 @@ class InsertTableOpsPass(ExportPass):
"""
For ops in self.table_ops they need to be serialized as a TOSA TABLE. This pass replaces these
edge ops with a tosa._table(input: Tensor, target_str: str) where target_str == str(node.target).
- When loweringthe _table node target_str will be used to find the corresponding torch operator
+ When lowering the _table node target_str will be used to find the corresponding torch operator
which will be used to produce the table values in operators/op_table.py.
"""
@@ -42,6 +42,8 @@ class InsertTableOpsPass(ExportPass):
exir_ops.edge.aten.rsqrt.default: torch.rsqrt,
exir_ops.edge.aten.sigmoid.default: torch.sigmoid,
exir_ops.edge.aten.tanh.default: torch.tanh,
+ exir_ops.edge.aten.hardsigmoid.default: torch.nn.functional.hardsigmoid,
+ exir_ops.edge.aten.hardswish.default: torch.nn.functional.hardswish,
}
def __init__(self, exported_program: ExportedProgram) -> None:
@@ -92,7 +94,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
with graph_module.graph.inserting_before(node):
table_node = create_node(
graph=graph_module.graph,
- op_target=torch.ops.tosa._table,
+ op_target=torch.ops.tosa._table.default,
args=(node.args[0],),
)
assert len(input_qparams) == 1
@@ -104,7 +106,11 @@ def call(self, graph_module: GraphModule) -> PassResult:
out_quantargs=output_qparams[0],
)
# Register buffer in self.exported_program.state_dict
- self.register_buffer(buffer_name=table_node.name, buffer=buffer)
+ # When the graph is retraced, the implementation _table is used and the suffix _default disappears from the node name
+ # Remove it here to make it possible to find in the node_visitor
+ self.register_buffer(
+ buffer_name=table_node.name.replace("_default", ""), buffer=buffer
+ )
node.replace_all_uses_with(table_node)
graph_module.graph.erase_node(node)
table_node.meta["input_qparams"] = input_qparams
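`InsertTableOpsPass` now also routes `hardsigmoid` and `hardswish` through TOSA TABLE ops. Conceptually, such a table is the reference torch operator evaluated over every representable int8 input and requantized with the output q-params; a sketch with made-up quantization parameters:

```python
import torch

# Illustrative only: build a 256-entry int8 lookup table for hardsigmoid.
# The scales/zero-points below are invented; the pass takes them from the
# folded input/output q-params attached to the node.
in_scale, in_zp = 0.05, 0
out_scale, out_zp = 1.0 / 255, -128

x_int8 = torch.arange(-128, 128, dtype=torch.int32)        # every int8 input value
x_fp = (x_int8 - in_zp).to(torch.float32) * in_scale       # dequantize
y_fp = torch.nn.functional.hardsigmoid(x_fp)               # reference operator
table = torch.clamp(
    torch.round(y_fp / out_scale) + out_zp, -128, 127
).to(torch.int8)                                            # requantize into the table
print(table.shape)  # torch.Size([256])
```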
diff --git a/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py b/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py
index f4d369a504..ad95379cc8 100644
--- a/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py
+++ b/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -66,7 +66,7 @@ def call(self, graph_module: torch.fx.GraphModule):
sum_node = cast(torch.fx.Node, node)
keep_dim = get_node_arg(
# pyre-ignore[6]
- sum_node.args,
+ sum_node.args, # type: ignore[arg-type]
keep_dim_index,
False,
)
@@ -74,7 +74,7 @@ def call(self, graph_module: torch.fx.GraphModule):
if keep_dim:
continue
- dim_list = get_node_arg(sum_node.args, 1, [0]) # pyre-ignore[6]
+ dim_list = get_node_arg(sum_node.args, 1, [0]) # type: ignore[arg-type] # pyre-ignore[6]
# Add keep_dim = True arg to sum node.
set_node_arg(sum_node, 2, True)
diff --git a/backends/arm/_passes/scalars_to_attribute_pass.py b/backends/arm/_passes/scalars_to_attribute_pass.py
index f6fe02b6eb..78865fe33f 100644
--- a/backends/arm/_passes/scalars_to_attribute_pass.py
+++ b/backends/arm/_passes/scalars_to_attribute_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -54,7 +54,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
if isinstance(arg, int) and not torch.is_floating_point(
get_first_fake_tensor(n)
):
- new_args.append(arg)
+ new_args.append(arg) # type: ignore[arg-type]
continue
prefix = "_tensor_constant_"
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index d695aec2fd..899bafcf04 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -15,7 +15,7 @@
import os
from typing import cast, final, List, Optional
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.arm_vela import vela_compile
from executorch.backends.arm.operators.node_visitor import get_node_visitors
@@ -230,7 +230,7 @@ def preprocess( # noqa: C901
# Converted output for this subgraph, serializer needs path early as it emits
# const data directly. Path created and data written only in debug builds.
tosa_graph = ts.TosaSerializer(artifact_path)
- graph_module = ArmPassManager(tosa_spec).transform_to_backend_pipeline(
+ graph_module = ArmPassManager(tosa_spec).transform_to_backend_pipeline( # type: ignore
exported_program=edge_program
)
diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py
index cc4058c4c5..8fde8dff61 100644
--- a/backends/arm/arm_partitioner.py
+++ b/backends/arm/arm_partitioner.py
@@ -10,7 +10,7 @@
from typing import Callable, final, List, Optional, Tuple
import torch
-from executorch.backends.arm.arm_backend import (
+from executorch.backends.arm.arm_backend import ( # type: ignore[attr-defined]
ArmBackend,
) # usort: skip
from executorch.backends.arm.operator_support.tosa_supported_operators import (
@@ -113,8 +113,41 @@ def ops_to_not_decompose(
self,
ep: ExportedProgram,
) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+ ops_to_not_decompose_if_quant_op = [
+ torch.ops.aten.hardsigmoid.default,
+ torch.ops.aten.hardswish.default,
+ ]
+
+ def filter_fn(node: torch.fx.Node) -> bool:
+ # This function selects operators that should not be decomposed, i.e. where:
+ # - Its target is in the ops_to_not_decompose_if_quant_op list.
+ # - All its inputs/outputs are quantize/dequantize operators.
+ dq = torch.ops.quantized_decomposed.dequantize_per_tensor.default
+ q = torch.ops.quantized_decomposed.quantize_per_tensor.default
+
+ if node.target in ops_to_not_decompose_if_quant_op:
+ # Assume we should not decompose the operator (it is quantized)
+ should_not_decompose = True
+
+ input_nodes = node.all_input_nodes
+ output_nodes = node.users
+
+ for inp in input_nodes:
+ if inp.target != dq:
+ should_not_decompose = False
+
+ for out in output_nodes:
+ if out.target != q:
+ should_not_decompose = False
+
+ return should_not_decompose
+
+ # By default, do not decompose the operator
+ return True
+
ops_to_not_decompose = [
torch.ops.aten.linear.default,
torch.ops.aten.upsample_nearest2d.vec,
- ]
- return (ops_to_not_decompose, None)
+ ] + ops_to_not_decompose_if_quant_op
+
+ return (ops_to_not_decompose, filter_fn)
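Reading the filter above: hardsigmoid/hardswish are kept un-decomposed only when every producer is a dequantize node and every consumer is a quantize node, so the quantized op can later be lowered through a TOSA TABLE; otherwise the normal decomposition runs. A compact, hedged restatement of the same predicate (assuming the quantized_decomposed ops are registered, as they are in the PT2E quantization flow):

```python
import torch
from torch.ao.quantization.fx import _decomposed  # noqa: F401  # registers quantized_decomposed ops

DQ = torch.ops.quantized_decomposed.dequantize_per_tensor.default
Q = torch.ops.quantized_decomposed.quantize_per_tensor.default
QUANT_ONLY_OPS = (
    torch.ops.aten.hardsigmoid.default,
    torch.ops.aten.hardswish.default,
)


def keep_undecomposed(node: torch.fx.Node) -> bool:
    # Ops outside the list are always kept un-decomposed; listed ops are kept
    # only when fully surrounded by quantize/dequantize nodes.
    if node.target not in QUANT_ONLY_OPS:
        return True
    return all(inp.target == DQ for inp in node.all_input_nodes) and all(
        user.target == Q for user in node.users
    )
```
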
diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py
index 918d95ba37..f7f0c4b49c 100644
--- a/backends/arm/arm_vela.py
+++ b/backends/arm/arm_vela.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -12,7 +12,7 @@
from typing import List
import numpy as np
-from ethosu.vela import vela
+from ethosu.vela import vela # type: ignore
# Pack either input or output tensor block, compose the related arrays into
@@ -96,13 +96,13 @@ def vela_compile(tosa_graph, args: List[str], shape_order=None):
block_name = block_name + b"\x00" * (16 - len(block_name))
# We need the acual unpadded block lengths for hw setup
- block_length = struct.pack(" bool
if input_dtype not in supported_dtypes:
logger.info(
f"Input dtype {input_val.dtype} is not supported in "
- f"{node.target.name()}." # pyre-ignore[16]
+ f"{node.target.name()}." # type: ignore[union-attr] # pyre-ignore[16]
)
return False
@@ -107,7 +107,7 @@ def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool
if output_val.dtype not in supported_dtypes[input_dtype]:
logger.info(
f"Output dtype {output_val.dtype} is not supported in "
- f"{node.target.name()} for input dtype {input_dtype}. " # pyre-ignore[16]
+ f"{node.target.name()} for input dtype {input_dtype}. " # type: ignore[union-attr] # pyre-ignore[16]
f"Supported output types: "
f"{''.join(str(t) for t in supported_dtypes[input_dtype])}"
)
@@ -118,7 +118,7 @@ def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool
if node.kwargs["memory_format"] in (torch.preserve_format,):
logger.info(
f"Argument 'memory_format' is not supported for "
- f"{node.target.name()} right now." # pyre-ignore[16]
+ f"{node.target.name()} right now." # type: ignore[union-attr] # pyre-ignore[16]
)
return False
@@ -126,10 +126,10 @@ def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool
if "dim_order" in node.kwargs:
dim_order = node.kwargs["dim_order"]
# pyre-ignore[6]
- if dim_order != list(range(len(dim_order))):
+ if dim_order != list(range(len(dim_order))): # type: ignore[arg-type]
logger.info(
f"Argument {dim_order=} is not supported for "
- f"{node.target.name()} right now." # pyre-ignore[16]
+ f"{node.target.name()} right now." # type: ignore[union-attr] # pyre-ignore[16]
)
return False
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
index c3102a86a4..36914579fe 100644
--- a/backends/arm/operator_support/tosa_supported_operators.py
+++ b/backends/arm/operator_support/tosa_supported_operators.py
@@ -76,9 +76,12 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool:
exir_ops.edge.aten.add.Tensor,
exir_ops.edge.aten.expand_copy.default,
exir_ops.edge.aten.cat.default,
+ exir_ops.edge.aten.clamp.default,
exir_ops.edge.aten.bmm.default,
exir_ops.edge.aten.permute_copy.default,
+ exir_ops.edge.aten.hardsigmoid.default,
exir_ops.edge.aten.hardtanh.default,
+ exir_ops.edge.aten.hardswish.default,
exir_ops.edge.aten.convolution.default,
exir_ops.edge.aten.div.Tensor,
exir_ops.edge.aten.eq.Tensor,
@@ -137,5 +140,5 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool:
def is_node_supported_custom(self, node: fx.Node) -> bool:
tosa_checks = get_registered_tosa_support_checks(self.tosa_spec)
if node.target in tosa_checks.keys():
- return tosa_checks[node.target].is_node_supported(node, self.tosa_spec)
+ return tosa_checks[node.target].is_node_supported(node, self.tosa_spec) # type: ignore[index]
return False
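With clamp, hardsigmoid and hardswish added to the supported set, a module like the illustrative example below can be picked up in full by the TOSA partitioner: the hard* activations are lowered via TOSA TABLE when quantized, and clamp maps to the new TOSA CLAMP visitor. The names here are made up for the example.

```python
import torch


class SmallBlock(torch.nn.Module):
    """Illustrative module that only uses ops from the supported-operator list."""

    def __init__(self) -> None:
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.conv(x)                         # aten.convolution
        x = torch.nn.functional.hardswish(x)     # aten.hardswish
        return torch.clamp(x, min=0.0, max=6.0)  # aten.clamp


out = SmallBlock()(torch.randn(1, 3, 16, 16))
```
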
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
index 5a97d33304..f57ba092bc 100644
--- a/backends/arm/operators/__init__.py
+++ b/backends/arm/operators/__init__.py
@@ -9,9 +9,9 @@
node_visitor,
op_add,
op_avg_pool2d,
- op_batch_norm,
op_bmm,
op_cat,
+ op_clamp,
op_conv2d,
op_eq,
op_exp,
diff --git a/backends/arm/operators/node_visitor.py b/backends/arm/operators/node_visitor.py
index 8609e5e391..afb5f93baa 100644
--- a/backends/arm/operators/node_visitor.py
+++ b/backends/arm/operators/node_visitor.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -7,7 +7,7 @@
from typing import Dict, List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.tosa_mapping import TosaArg
from executorch.backends.arm.tosa_specification import TosaSpecification
@@ -44,7 +44,7 @@ def define_node(
# container for all node visitors
-_node_visitor_dicts = {
+_node_visitor_dicts: Dict[TosaSpecification, Dict] = {
TosaSpecification.create_from_string("TOSA-0.80+BI"): {},
TosaSpecification.create_from_string("TOSA-0.80+MI"): {},
}
diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py
index 74f00354ed..ccdeb2c1bc 100644
--- a/backends/arm/operators/op_add.py
+++ b/backends/arm/operators/op_add.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -10,7 +10,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
import executorch.backends.arm.tosa_utils as tutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -75,7 +75,7 @@ def define_node(
if output.dtype == ts.DType.INT8:
# Scale output back to 8 bit
# pyre-ignore
- tqutils.insert_rescale_op_to_int8(tosa_graph, add_output, scale_back, node)
+ tqutils.insert_rescale_op_to_int8(tosa_graph, add_output, scale_back, node) # type: ignore[possibly-undefined]
@register_node_visitor
diff --git a/backends/arm/operators/op_avg_pool2d.py b/backends/arm/operators/op_avg_pool2d.py
index fecddac659..e300b3ed01 100644
--- a/backends/arm/operators/op_avg_pool2d.py
+++ b/backends/arm/operators/op_avg_pool2d.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
# pyre-fixme[21]: ' Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`
diff --git a/backends/arm/operators/op_batch_norm.py b/backends/arm/operators/op_batch_norm.py
deleted file mode 100644
index ce5998cb72..0000000000
--- a/backends/arm/operators/op_batch_norm.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# pyre-unsafe
-from typing import List
-
-import serializer.tosa_serializer as ts
-import torch
-from executorch.backends.arm.operators.node_visitor import (
- NodeVisitor,
- register_node_visitor,
-)
-from executorch.backends.arm.tosa_mapping import TosaArg
-from executorch.backends.arm.tosa_specification import TosaSpecification
-from executorch.backends.arm.tosa_utils import promote_shape, tosa_shape
-from serializer.tosa_serializer import TosaOp
-
-
-@register_node_visitor
-class BatchNormVisitor(NodeVisitor):
- target = "aten._native_batch_norm_legit_no_training.default"
-
- tosa_specs = [
- TosaSpecification.create_from_string("TOSA-0.80+MI"),
- ]
-
- def __init__(self, *args):
- super().__init__(*args)
-
- # For BatchNorm2D, mean and var are calculated over the channel dimension
- # But TOSA doesn't allow subtraction of inputs with different ranks
- # Need to augment the shapes to match the ranks with activations
- def augment_shape_rank(self, shape, dim_order):
- nchw_shape = (1, *shape, 1, 1)
- return tosa_shape(nchw_shape, dim_order)
-
- def define_node(
- self,
- node: torch.fx.Node,
- tosa_graph: ts.TosaSerializer,
- inputs: List[TosaArg],
- output: TosaArg,
- ) -> None:
- # Decompose batch norm into sequence
- (activations, weights, bias, running_mean, running_var, momentum, epsilon) = (
- inputs
- )
-
- input_dtype = activations.dtype
-
- assert (
- 0.1 == momentum.number
- ), "Expected 0.1 momentum, not currently encoded into TOSA"
-
- # %output = (%x - %E[x]) / SQRT( %Var[x] + %epsilon ) * %gamma + %beta
- # e.g.
- # %output = (%activations - %running_mean) / SQRT( %running_var + %epsilon_const ) * %weights + %bias
- # ->
- # %op1 = tosa.SUB(%activations, %running_mean)
- # %op2 = tosa.ADD(%running_var, %epsilon_const)
- # %op3 = tosa.RSQRT(%op2)
- # %op4 = tosa.MUL(%op1, %op3)
- # %op5 = tosa.MUL(%op4, %weights)
- # %output = tosa.ADD(%op5, %bias)
-
- # Reshape mean to match rank of activations
- mean_reshaped = promote_shape(
- tosa_graph,
- running_mean,
- self.augment_shape_rank(running_mean.shape, output.dim_order),
- input_dtype,
- )
-
- # Subtract mean
- # %op1 = tosa.SUB(%activations, %running_mean)
- op1 = tosa_graph.addIntermediate(
- tosa_shape(output.shape, output.dim_order), input_dtype
- )
- tosa_graph.addOperator(
- TosaOp.Op().SUB,
- [activations.name, mean_reshaped.name],
- [op1.name],
- )
- # Adding eplison to variance
- # %op2 = tosa.ADD(%running_var, %epsilon_const)
- epsilon_const = tosa_graph.addConst([1], input_dtype, [epsilon.number])
- op2 = tosa_graph.addIntermediate(
- tosa_shape(running_var.shape, running_var.dim_order), input_dtype
- )
- tosa_graph.addOperator(
- TosaOp.Op().ADD,
- [running_var.name, epsilon_const.name],
- [op2.name],
- )
- # Push downward the variance
- # %op3 = tosa.RSQRT(%op2)
- op3 = tosa_graph.addIntermediate(running_var.shape, input_dtype)
- tosa_graph.addOperator(TosaOp.Op().RSQRT, [op2.name], [op3.name])
-
- # Reshape variable to match rank of activations
- op3_reshaped = promote_shape(
- tosa_graph,
- op3,
- self.augment_shape_rank(running_var.shape, output.dim_order),
- input_dtype,
- )
-
- # Handle non existing weights and bias
- if not weights.name and not bias.name:
- # Multiply shifted activations with reciprocal variance
- # %output = tosa.MUL(%op1, %op3) e.g. Now we have %output = (%activations - %running_mean) / SQRT( %running_var + %epsilon_const )
- attr_mul = ts.TosaSerializerAttribute()
- attr_mul.MulAttribute(0)
- tosa_graph.addOperator(
- TosaOp.Op().MUL, [op1.name, op3_reshaped.name], [output.name], attr_mul
- )
- return
- else:
- # Multiply shifted activations with reciprocal variance
- # %op4 = tosa.MUL(%op1, %op3)
- op4 = tosa_graph.addIntermediate(
- tosa_shape(output.shape, output.dim_order), input_dtype
- )
- attr_mul = ts.TosaSerializerAttribute()
- attr_mul.MulAttribute(0)
- tosa_graph.addOperator(
- TosaOp.Op().MUL, [op1.name, op3_reshaped.name], [op4.name], attr_mul
- )
-
- # Now we have %op4 = (%activations - %running_mean) / SQRT( %running_var + %epsilon_const )
-
- if weights.name and not bias.name:
- # Handle only weights but no bias
-
- # Reshape weights to match rank of activations
- weights_reshaped = promote_shape(
- tosa_graph,
- weights,
- self.augment_shape_rank(weights.shape, output.dim_order),
- input_dtype,
- )
-
- # %output = tosa.MUL(%op4, %weights)
- attr_mul = ts.TosaSerializerAttribute()
- attr_mul.MulAttribute(0)
- tosa_graph.addOperator(
- TosaOp.Op().MUL,
- [op4.name, weights_reshaped.name],
- [output.name],
- attr_mul,
- )
- return
-
- if not weights.name and bias.name:
- # Handle only bias but no weights
-
- # Reshape bias to match rank of activations
- bias_reshaped = promote_shape(
- tosa_graph,
- bias,
- self.augment_shape_rank(bias.shape, output.dim_order),
- input_dtype,
- )
-
- # %output = tosa.ADD(%op4, %bias)
- tosa_graph.addOperator(
- TosaOp.Op().ADD,
- [op4.name, bias_reshaped.name],
- [output.name],
- )
- return
-
- # We have both weights and bias
-
- # Reshape weights to match rank of activations
- weights_reshaped = promote_shape(
- tosa_graph,
- weights,
- self.augment_shape_rank(weights.shape, output.dim_order),
- input_dtype,
- )
-
- # %op5 = tosa.MUL(%op4, %weights)
- op5 = tosa_graph.addIntermediate(
- tosa_shape(output.shape, output.dim_order), input_dtype
- )
- attr_mul = ts.TosaSerializerAttribute()
- attr_mul.MulAttribute(0)
- tosa_graph.addOperator(
- TosaOp.Op().MUL,
- [op4.name, weights_reshaped.name],
- [op5.name],
- attr_mul,
- )
-
- # Reshape bias to match rank of activations
- bias_reshaped = promote_shape(
- tosa_graph,
- bias,
- self.augment_shape_rank(bias.shape, output.dim_order),
- input_dtype,
- )
-
- # %output = tosa.ADD(%op5, %bias)
- tosa_graph.addOperator(
- TosaOp.Op().ADD,
- [op5.name, bias_reshaped.name],
- [output.name],
- )
diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py
index 83d3df2701..d3261ebde0 100644
--- a/backends/arm/operators/op_bmm.py
+++ b/backends/arm/operators/op_bmm.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -7,7 +7,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
@@ -75,14 +75,14 @@ def define_node(
if output.dtype == ts.DType.INT8:
output_qparams = get_output_qparams(node)[0] # pyre-ignore[16]
final_output_scale = (
- input_qparams[0].scale * input_qparams[1].scale # pyre-ignore[61]
+ input_qparams[0].scale * input_qparams[1].scale # type: ignore[possibly-undefined] # pyre-ignore[61]
) / output_qparams.scale
build_rescale(
tosa_fb=tosa_graph,
scale=final_output_scale,
# pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined.
- input_node=bmm_result,
+ input_node=bmm_result, # type: ignore[possibly-undefined]
output_name=output.name,
output_type=ts.DType.INT8,
output_shape=bmm_result.shape,
diff --git a/backends/arm/operators/op_cat.py b/backends/arm/operators/op_cat.py
index e249942d0b..f786395cc3 100644
--- a/backends/arm/operators/op_cat.py
+++ b/backends/arm/operators/op_cat.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -7,7 +7,7 @@
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_clamp.py b/backends/arm/operators/op_clamp.py
new file mode 100644
index 0000000000..486da27c9a
--- /dev/null
+++ b/backends/arm/operators/op_clamp.py
@@ -0,0 +1,144 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree
+
+from typing import Any, List, Tuple
+
+import serializer.tosa_serializer as ts # type: ignore
+
+import torch
+from executorch.backends.arm.operators.node_visitor import (
+ NodeVisitor,
+ register_node_visitor,
+)
+
+from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_specification import TosaSpecification
+from serializer.tosa_serializer import TosaOp
+from torch.fx import Node
+
+
+@register_node_visitor
+class ClampVisitor_080_BI(NodeVisitor):
+ target = "aten.clamp.default"
+
+ tosa_specs = [
+ TosaSpecification.create_from_string("TOSA-0.80+BI"),
+ ]
+
+ def __init__(self, *args):
+ super().__init__(*args)
+
+ def _create_clamp_node(
+ self,
+ tosa_graph: ts.TosaSerializer,
+ input_name: str,
+ output_name: str,
+ min_int: int,
+ max_int: int,
+ min_fp32: float,
+ max_fp32: float,
+ ) -> None:
+ attr = ts.TosaSerializerAttribute()
+ attr.ClampAttribute(
+ tosa_graph.builder,
+ min_int,
+ max_int,
+ min_fp32,
+ max_fp32,
+ )
+ tosa_graph.addOperator(TosaOp.Op().CLAMP, [input_name], [output_name], attr)
+
+ def _get_min_max_arguments(
+ self, node: Node, dtype_min: int | float, dtype_max: int | float
+ ) -> Tuple[int | float, int | float]:
+
+ def cast_type(value: Any) -> int | float:
+ if isinstance(value, int):
+ return value
+ else:
+ # Attempt to cast to float
+ return float(value)
+
+ assert 2 <= len(node.args) <= 3
+
+ min_arg = dtype_min
+ max_arg = dtype_max
+
+ if node.args[1] is not None:
+ min_arg = cast_type(node.args[1])
+
+ if len(node.args) > 2:
+ if node.args[2] is not None:
+ max_arg = cast_type(node.args[2])
+
+ return min_arg, max_arg
+
+ def define_node(
+ self,
+ node: Node,
+ tosa_graph: ts.TosaSerializer,
+ inputs: List[TosaArg],
+ output: TosaArg,
+ ) -> None:
+ assert len(node.all_input_nodes) == 1
+
+ min_int8, max_int8 = self._get_min_max_arguments(
+ node,
+ torch.iinfo(torch.int8).min,
+ torch.iinfo(torch.int8).max,
+ )
+
+ # NOTE: Quantization of the min/max arguments is handled by QuantizeOperatorArguments
+ self._create_clamp_node(
+ tosa_graph,
+ inputs[0].name,
+ output.name,
+ int(min_int8),
+ int(max_int8),
+ 0,
+ 0,
+ )
+
+
+@register_node_visitor
+class ClampVisitor_080_MI(ClampVisitor_080_BI):
+ # inheriting 'target' from BI class
+
+ tosa_specs = [
+ TosaSpecification.create_from_string("TOSA-0.80+MI"),
+ ]
+
+ def __init__(self, *args):
+ super().__init__(*args)
+
+ def define_node(
+ self,
+ node: Node,
+ tosa_graph: ts.TosaSerializer,
+ inputs: List[TosaArg],
+ output: TosaArg,
+ ) -> None:
+ assert len(node.all_input_nodes) == 1
+
+ if inputs[0].dtype == ts.DType.INT8:
+ # Call the inherited define_node for handling integers
+ super().define_node(node, tosa_graph, inputs, output)
+ else:
+ min_fp32, max_fp32 = self._get_min_max_arguments(
+ node,
+ torch.finfo(torch.float32).min,
+ torch.finfo(torch.float32).max,
+ )
+
+ self._create_clamp_node(
+ tosa_graph,
+ inputs[0].name,
+ output.name,
+ 0,
+ 0,
+ min_fp32,
+ max_fp32,
+ )
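The min/max handling in _get_min_max_arguments falls back to the dtype's numeric limits when aten.clamp is called with None bounds, and only one of the int/fp32 attribute pairs is populated depending on the input dtype. A standalone sketch of that fallback (the helper name is hypothetical):

```python
import torch


def resolve_clamp_bounds(min_arg, max_arg, dtype=torch.float32):
    # None means "unbounded", so substitute the dtype's own min/max.
    info = torch.finfo(dtype) if dtype.is_floating_point else torch.iinfo(dtype)
    lo = info.min if min_arg is None else min_arg
    hi = info.max if max_arg is None else max_arg
    return lo, hi


print(resolve_clamp_bounds(None, 6.0))             # (-3.4028234663852886e+38, 6.0)
print(resolve_clamp_bounds(-1, None, torch.int8))  # (-1, 127)
```
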
diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py
index 42156da013..f97e408a02 100644
--- a/backends/arm/operators/op_conv2d.py
+++ b/backends/arm/operators/op_conv2d.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
@@ -165,13 +165,13 @@ def define_node(
# integer value domain of the next op. Otherwise return float32 output.
if inputs[0].dtype == ts.DType.INT8:
# Get scale_factor from input, weight, and output.
- input_scale = input_qparams[0].scale # pyre-ignore [61]
+ input_scale = input_qparams[0].scale # type: ignore[possibly-undefined] # pyre-ignore [61]
weight_scale = input_qparams[1].scale # pyre-ignore [61]
output_qargs = get_output_qparams(node) # pyre-ignore [16]
build_rescale_conv_output(
tosa_graph,
# pyre-fixme[61]: Uninitialized local [61]: Local variable `conv2d_res` is undefined, or not always defined.
- conv2d_res,
+ conv2d_res, # type: ignore[possibly-undefined]
output.name,
output.dtype,
input_scale,
diff --git a/backends/arm/operators/op_eq.py b/backends/arm/operators/op_eq.py
index e6e2492aec..02fc89099e 100644
--- a/backends/arm/operators/op_eq.py
+++ b/backends/arm/operators/op_eq.py
@@ -9,7 +9,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_exp.py b/backends/arm/operators/op_exp.py
index 46f4980975..4b8232ef6e 100644
--- a/backends/arm/operators/op_exp.py
+++ b/backends/arm/operators/op_exp.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_full.py b/backends/arm/operators/op_full.py
index 7964e58226..f06b9873e6 100644
--- a/backends/arm/operators/op_full.py
+++ b/backends/arm/operators/op_full.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -8,7 +8,7 @@
import numpy as np
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -40,7 +40,7 @@ def define_node(
if output.dtype == ts.DType.INT8:
fill_dtype = np.int8
else:
- fill_dtype = np.float32
+ fill_dtype = np.float32 # type: ignore[assignment]
data = np.full(shape, value, dtype=fill_dtype)
tosa_graph.addConst(shape, output.dtype, data, node.name + "full-const")
diff --git a/backends/arm/operators/op_ge.py b/backends/arm/operators/op_ge.py
index 810b40bb1a..e4de12f332 100644
--- a/backends/arm/operators/op_ge.py
+++ b/backends/arm/operators/op_ge.py
@@ -9,7 +9,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_get_item.py b/backends/arm/operators/op_get_item.py
index f7372262c6..577a8c8d2e 100644
--- a/backends/arm/operators/op_get_item.py
+++ b/backends/arm/operators/op_get_item.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
diff --git a/backends/arm/operators/op_gt.py b/backends/arm/operators/op_gt.py
index 7a22db6686..65cf8197bd 100644
--- a/backends/arm/operators/op_gt.py
+++ b/backends/arm/operators/op_gt.py
@@ -9,7 +9,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py
index c971b50b66..fc0ee552a9 100644
--- a/backends/arm/operators/op_hardtanh.py
+++ b/backends/arm/operators/op_hardtanh.py
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
diff --git a/backends/arm/operators/op_le.py b/backends/arm/operators/op_le.py
index ee6929617e..8fea2b9208 100644
--- a/backends/arm/operators/op_le.py
+++ b/backends/arm/operators/op_le.py
@@ -9,7 +9,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_log.py b/backends/arm/operators/op_log.py
index 868eeb9443..7f664900b3 100644
--- a/backends/arm/operators/op_log.py
+++ b/backends/arm/operators/op_log.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_lt.py b/backends/arm/operators/op_lt.py
index 20bac97af4..da93ab4179 100644
--- a/backends/arm/operators/op_lt.py
+++ b/backends/arm/operators/op_lt.py
@@ -9,7 +9,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_max.py b/backends/arm/operators/op_max.py
index 660a2cf0af..35a635de13 100644
--- a/backends/arm/operators/op_max.py
+++ b/backends/arm/operators/op_max.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -8,7 +8,7 @@
from typing import List
import executorch.backends.arm.tosa_quant_utils as tqutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
diff --git a/backends/arm/operators/op_max_pool2d.py b/backends/arm/operators/op_max_pool2d.py
index 6cb5f0490e..f32300f561 100644
--- a/backends/arm/operators/op_max_pool2d.py
+++ b/backends/arm/operators/op_max_pool2d.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
diff --git a/backends/arm/operators/op_min.py b/backends/arm/operators/op_min.py
index 2282d9e1cf..a409acf1ae 100644
--- a/backends/arm/operators/op_min.py
+++ b/backends/arm/operators/op_min.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -9,7 +9,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py
index c6a315d445..ef886de11e 100644
--- a/backends/arm/operators/op_mul.py
+++ b/backends/arm/operators/op_mul.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -10,7 +10,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
import executorch.backends.arm.tosa_utils as tutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py
index 16d3d4a04e..103ae1b9a2 100644
--- a/backends/arm/operators/op_permute.py
+++ b/backends/arm/operators/op_permute.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -7,7 +7,7 @@
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
diff --git a/backends/arm/operators/op_reciprocal.py b/backends/arm/operators/op_reciprocal.py
index 121b78fed6..5410e1dd99 100644
--- a/backends/arm/operators/op_reciprocal.py
+++ b/backends/arm/operators/op_reciprocal.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py
index b5ffa2aa70..c37e4b3e75 100644
--- a/backends/arm/operators/op_relu.py
+++ b/backends/arm/operators/op_relu.py
@@ -5,7 +5,7 @@
# pyre-unsafe
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch.fx
# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py
index fd76a52052..b97d7023ef 100644
--- a/backends/arm/operators/op_repeat.py
+++ b/backends/arm/operators/op_repeat.py
@@ -1,11 +1,11 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# pyre-unsafe
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
diff --git a/backends/arm/operators/op_rshift.py b/backends/arm/operators/op_rshift.py
index 2c1f4d5bbe..ac61cca6a9 100644
--- a/backends/arm/operators/op_rshift.py
+++ b/backends/arm/operators/op_rshift.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -6,7 +6,7 @@
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
diff --git a/backends/arm/operators/op_rsqrt.py b/backends/arm/operators/op_rsqrt.py
index 1cc3e8fcff..0fbb203b08 100644
--- a/backends/arm/operators/op_rsqrt.py
+++ b/backends/arm/operators/op_rsqrt.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
diff --git a/backends/arm/operators/op_sigmoid.py b/backends/arm/operators/op_sigmoid.py
index 0c28c0ed00..118c813dcf 100644
--- a/backends/arm/operators/op_sigmoid.py
+++ b/backends/arm/operators/op_sigmoid.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py
index 9327e005b6..7f4804af58 100644
--- a/backends/arm/operators/op_slice.py
+++ b/backends/arm/operators/op_slice.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -7,7 +7,7 @@
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py
index 0c569a6ffd..6cd422095a 100644
--- a/backends/arm/operators/op_sub.py
+++ b/backends/arm/operators/op_sub.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -10,7 +10,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
import executorch.backends.arm.tosa_utils as tutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
@@ -75,7 +75,7 @@ def define_node(
if output.dtype == ts.DType.INT8:
# Scale output back to 8 bit
# pyre-ignore
- tqutils.insert_rescale_op_to_int8(tosa_graph, sub_output, scale_back, node)
+ tqutils.insert_rescale_op_to_int8(tosa_graph, sub_output, scale_back, node) # type: ignore[possibly-undefined]
@register_node_visitor
diff --git a/backends/arm/operators/op_sum.py b/backends/arm/operators/op_sum.py
index dcc194a656..b5b388b335 100644
--- a/backends/arm/operators/op_sum.py
+++ b/backends/arm/operators/op_sum.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -10,7 +10,7 @@
import executorch.backends.arm.tosa_quant_utils as tqutils
import executorch.backends.arm.tosa_utils as tutils
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_table.py b/backends/arm/operators/op_table.py
index bfaaf4578e..b411d8b91b 100644
--- a/backends/arm/operators/op_table.py
+++ b/backends/arm/operators/op_table.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -9,7 +9,7 @@
import numpy as np
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
@@ -21,7 +21,7 @@
@register_node_visitor
class TableVisitor(NodeVisitor):
- target = "_table"
+ target = "_table.default"
def define_node(
self,
@@ -30,9 +30,9 @@ def define_node(
inputs: List[TosaArg],
output: TosaArg,
) -> None:
- assert node.name in self._exported_program.state_dict.keys()
+ assert node.name in self._exported_program.state_dict.keys() # type: ignore[union-attr]
assert inputs[0].dtype == output.dtype == ts.DType.INT8
- table = self._exported_program.state_dict[node.name]
+ table = self._exported_program.state_dict[node.name] # type: ignore[union-attr]
table_attr = ts.TosaSerializerAttribute()
table_attr.TableAttribute(np.array(table))
tosa_graph.addOperator(
diff --git a/backends/arm/operators/op_tanh.py b/backends/arm/operators/op_tanh.py
index a1e91be4ff..7961b14f2a 100644
--- a/backends/arm/operators/op_tanh.py
+++ b/backends/arm/operators/op_tanh.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
register_node_visitor,
diff --git a/backends/arm/operators/op_to_copy.py b/backends/arm/operators/op_to_copy.py
index 256e54f3a2..feaec3a41e 100644
--- a/backends/arm/operators/op_to_copy.py
+++ b/backends/arm/operators/op_to_copy.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,9 +6,9 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
-import tosa.Op as TosaOp
+import tosa.Op as TosaOp # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
diff --git a/backends/arm/operators/op_to_dim_order_copy.py b/backends/arm/operators/op_to_dim_order_copy.py
index c2ec620b82..397979a439 100644
--- a/backends/arm/operators/op_to_dim_order_copy.py
+++ b/backends/arm/operators/op_to_dim_order_copy.py
@@ -6,9 +6,9 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
-import tosa.Op as TosaOp
+import tosa.Op as TosaOp # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
diff --git a/backends/arm/operators/op_transpose.py b/backends/arm/operators/op_transpose.py
index 42675be34b..54a79297dd 100644
--- a/backends/arm/operators/op_transpose.py
+++ b/backends/arm/operators/op_transpose.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -7,7 +7,7 @@
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
@@ -25,7 +25,7 @@ class TransposeVisitor(NodeVisitor):
Inserts a TOSA TRANSPOSE.
"""
- target = "_transpose"
+ target = "_transpose.default"
def define_node(
self,
diff --git a/backends/arm/operators/op_upsample_nearest2d.py b/backends/arm/operators/op_upsample_nearest2d.py
index 68fcb521d9..38e4087d38 100644
--- a/backends/arm/operators/op_upsample_nearest2d.py
+++ b/backends/arm/operators/op_upsample_nearest2d.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,7 +6,7 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
@@ -16,7 +16,7 @@
from executorch.backends.arm.tosa_utils import get_resize_parameters, tosa_shape
from serializer.tosa_serializer import TosaOp
-from tosa.ResizeMode import ResizeMode
+from tosa.ResizeMode import ResizeMode # type: ignore
@register_node_visitor
diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py
index 3489795ed5..119e32fa58 100644
--- a/backends/arm/operators/op_view.py
+++ b/backends/arm/operators/op_view.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -6,9 +6,9 @@
# pyre-unsafe
from typing import List
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
-import tosa.Op as TosaOp
+import tosa.Op as TosaOp # type: ignore
from executorch.backends.arm.operators.node_visitor import (
NodeVisitor,
diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py
index 36a1567df9..a83ead987e 100644
--- a/backends/arm/process_node.py
+++ b/backends/arm/process_node.py
@@ -8,7 +8,7 @@
from typing import cast, Dict
import numpy as np
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
import torch.fx
from executorch.backends.arm.operators.node_visitor import NodeVisitor
@@ -36,9 +36,9 @@ def process_call_function(
# Visiting each Node
# pyre-ignore[16]: Undefined attribute.
- if node.target.__name__ in node_visitors:
+ if node.target.__name__ in node_visitors: # type: ignore[union-attr]
# pyre-ignore[16]: Undefined attribute.
- node_visitors[node.target.__name__].define_node(
+ node_visitors[node.target.__name__].define_node( # type: ignore[union-attr]
node,
tosa_graph,
inputs,
diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py
index cba66cfe56..c1a017fa1d 100644
--- a/backends/arm/quantizer/arm_quantizer.py
+++ b/backends/arm/quantizer/arm_quantizer.py
@@ -20,8 +20,12 @@
from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager
from executorch.backends.arm.quantizer import arm_quantizer_utils
-from executorch.backends.arm.quantizer.arm_quantizer_utils import mark_node_as_annotated
-from executorch.backends.arm.quantizer.quantization_annotator import annotate_graph
+from executorch.backends.arm.quantizer.arm_quantizer_utils import ( # type: ignore[attr-defined]
+ mark_node_as_annotated,
+)
+from executorch.backends.arm.quantizer.quantization_annotator import ( # type: ignore[import-not-found]
+ annotate_graph,
+)
from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig
from executorch.backends.arm.tosa_specification import TosaSpecification
@@ -253,7 +257,7 @@ def transform_for_annotation(self, model: GraphModule) -> GraphModule:
Currently transforms scalar values to tensor attributes.
"""
- return ArmPassManager(self.tosa_spec).transform_for_annotation_pipeline(
+ return ArmPassManager(self.tosa_spec).transform_for_annotation_pipeline( # type: ignore[arg-type]
graph_module=model
)
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index f2a124f279..32f64963e8 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -55,7 +55,7 @@ def _is_ok_for_quantization(
for n_arg in _as_list(node.args[quant_property.index]):
assert isinstance(n_arg, Node)
- if not arm_quantizer_utils.is_ok_for_quantization(n_arg, gm):
+ if not arm_quantizer_utils.is_ok_for_quantization(n_arg, gm): # type: ignore[attr-defined]
return False
return True
@@ -77,7 +77,7 @@ def _annotate_input(node: Node, quant_property: _QuantProperty):
assert isinstance(n_arg, Node)
_annotate_input_qspec_map(node, n_arg, qspec)
if quant_property.mark_annotated:
- arm_quantizer_utils.mark_node_as_annotated(n_arg)
+ arm_quantizer_utils.mark_node_as_annotated(n_arg) # type: ignore[attr-defined]
def _annotate_output(node: Node, quant_property: _QuantProperty):
@@ -107,7 +107,7 @@ def _match_pattern(
child = next(iter(node.users))
elif node.target in pattern[1]:
assert len(node.args) != 0
- parent = node.args[0]
+ parent = node.args[0] # type: ignore[assignment]
child = node
else:
return False
@@ -132,6 +132,8 @@ def _match_pattern(
torch.ops.aten.sigmoid.default,
torch.ops.aten.tanh.default,
torch.ops.aten.sum.dim_IntList,
+ torch.ops.aten.hardsigmoid.default,
+ torch.ops.aten.hardswish.default,
]
_one_to_one_shared_input_qspec = [
@@ -186,6 +188,8 @@ def _match_pattern(
torch.ops.aten.full.default,
torch.ops.aten.flatten.using_ints,
torch.ops.aten.dropout.default,
+ torch.ops.aten.clamp.default,
+ torch.ops.aten.clamp.Tensor,
operator.getitem,
]
@@ -259,23 +263,23 @@ def any_or_hardtanh_min_zero(n: Node):
torch.ops.aten.minimum.default,
torch.ops.aten.maximum.default,
):
- shared_qspec = SharedQuantizationSpec((node.args[0], node))
+ shared_qspec = SharedQuantizationSpec((node.args[0], node)) # type: ignore[arg-type]
quant_properties.quant_inputs = [
_QuantProperty(0, input_act_qspec),
_QuantProperty(
- 1, input_act_qspec if node.args[0] == node.args[1] else shared_qspec
+ 1, input_act_qspec if node.args[0] == node.args[1] else shared_qspec # type: ignore[arg-type]
),
]
- quant_properties.quant_output = _QuantProperty(0, shared_qspec)
+ quant_properties.quant_output = _QuantProperty(0, shared_qspec) # type: ignore[arg-type]
elif node.target == torch.ops.aten.adaptive_avg_pool2d.default:
input_qspec = (
- SharedQuantizationSpec(node.args[0])
- if arm_quantizer_utils.is_output_annotated(node.args[0])
+ SharedQuantizationSpec(node.args[0]) # type: ignore[arg-type]
+ if arm_quantizer_utils.is_output_annotated(node.args[0]) # type: ignore
else input_act_qspec
)
- quant_properties.quant_inputs = [_QuantProperty(0, input_qspec)]
+ quant_properties.quant_inputs = [_QuantProperty(0, input_qspec)] # type: ignore[arg-type]
quant_properties.quant_output = _QuantProperty(
- 0, SharedQuantizationSpec((node.args[0], node))
+ 0, SharedQuantizationSpec((node.args[0], node)) # type: ignore[arg-type]
)
elif node.target in (
torch.ops.aten.cat.default,
@@ -290,19 +294,19 @@ def any_or_hardtanh_min_zero(n: Node):
_QuantProperty(
0,
[
- input_act_qspec if n == node.args[0][0] else shared_qspec
+ input_act_qspec if n == node.args[0][0] else shared_qspec # type: ignore[misc]
for n in node.args[0]
],
)
]
- quant_properties.quant_output = _QuantProperty(0, shared_qspec)
+ quant_properties.quant_output = _QuantProperty(0, shared_qspec) # type: ignore[arg-type]
elif node.target in _one_to_one:
quant_properties.quant_inputs = [_QuantProperty(0, input_act_qspec)]
quant_properties.quant_output = _QuantProperty(0, output_act_qspec)
elif node.target in _one_to_one_shared_input_qspec:
quant_properties.quant_inputs = [_QuantProperty(0, input_act_qspec)]
quant_properties.quant_output = _QuantProperty(
- 0, SharedQuantizationSpec((node.args[0], node))
+ 0, SharedQuantizationSpec((node.args[0], node)) # type: ignore[arg-type]
)
elif node.target in [
torch.ops.aten.eq.Tensor,
@@ -311,26 +315,26 @@ def any_or_hardtanh_min_zero(n: Node):
torch.ops.aten.le.Tensor,
torch.ops.aten.lt.Tensor,
]:
- shared_qspec = SharedQuantizationSpec((node.args[0], node))
+ shared_qspec = SharedQuantizationSpec((node.args[0], node)) # type: ignore[arg-type]
quant_properties.quant_inputs = [
_QuantProperty(0, input_act_qspec),
_QuantProperty(
- 1, input_act_qspec if node.args[0] == node.args[1] else shared_qspec
+ 1, input_act_qspec if node.args[0] == node.args[1] else shared_qspec # type: ignore[arg-type]
),
]
quant_properties.quant_output = None
elif node.target in _parent_shared_qspec:
if not isinstance(node.args[0], Node):
- return None
+ return None # type: ignore[return-value]
- if not arm_quantizer_utils.is_output_annotated(node.args[0]):
- return None
+ if not arm_quantizer_utils.is_output_annotated(node.args[0]): # type: ignore[attr-defined]
+ return None # type: ignore[return-value]
shared_qspec = SharedQuantizationSpec(node.args[0])
- quant_properties.quant_inputs = [_QuantProperty(0, shared_qspec)]
- quant_properties.quant_output = _QuantProperty(0, shared_qspec)
+ quant_properties.quant_inputs = [_QuantProperty(0, shared_qspec)] # type: ignore[arg-type]
+ quant_properties.quant_output = _QuantProperty(0, shared_qspec) # type: ignore[arg-type]
else:
- return None
+ return None # type: ignore[return-value]
# Don't check if operator.getitem is ok for quantization, it's always ok
if node.target == operator.getitem:
@@ -340,16 +344,16 @@ def any_or_hardtanh_min_zero(n: Node):
# provided QuantProperties
for quant_property in quant_properties.quant_inputs:
if not _is_ok_for_quantization(node, quant_property, gm):
- return None
+ return None # type: ignore[return-value]
if quant_properties.quant_output is not None:
if not _is_ok_for_quantization(node, quant_properties.quant_output, gm):
- return None
+ return None # type: ignore[return-value]
return quant_properties
-def annotate_graph(
+def annotate_graph( # type: ignore[return]
gm: torch.fx.GraphModule,
quantization_config: QuantizationConfig,
filter_fn: Optional[Callable[[Node], bool]] = None,
@@ -374,4 +378,4 @@ def annotate_graph(
if quant_properties.quant_output is not None:
_annotate_output(node, quant_properties.quant_output)
- arm_quantizer_utils.mark_node_as_annotated(node)
+ arm_quantizer_utils.mark_node_as_annotated(node) # type: ignore[attr-defined]
diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py
index b94d9bda64..394995201e 100644
--- a/backends/arm/quantizer/quantization_config.py
+++ b/backends/arm/quantizer/quantization_config.py
@@ -82,14 +82,14 @@ def _derive_qparams_fn(
input_act = node.args[0]
weight = node.args[1]
quantization_spec = DerivedQuantizationSpec(
- derived_from=[(input_act, node), (weight, node)],
+ derived_from=[(input_act, node), (weight, node)], # type: ignore[list-item]
derive_qparams_fn=_derive_qparams_fn,
dtype=torch.int32,
quant_min=torch.iinfo(torch.int32).min,
quant_max=torch.iinfo(torch.int32).max - 1,
qscheme=torch.per_tensor_symmetric,
)
- return quantization_spec
+ return quantization_spec # type: ignore[return-value]
if self.bias is None:
return None
diff --git a/backends/arm/scripts/pre-commit b/backends/arm/scripts/pre-commit
new file mode 100755
index 0000000000..2000585f93
--- /dev/null
+++ b/backends/arm/scripts/pre-commit
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Check 1: If commit header contains WIP, everything is ok
+git rev-list --format=%s --max-count=1 HEAD | grep -q WIP && exit 0
+
+# Check 2: lintrunner on the latest patch.
+lintrunner -a --revision 'HEAD^' --skip MYPY
+commit_files=$(git diff-tree --no-commit-id --name-only --diff-filter=M HEAD -r)
+git add $commit_files || true
\ No newline at end of file
diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push
new file mode 100755
index 0000000000..c51138b8ec
--- /dev/null
+++ b/backends/arm/scripts/pre-push
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Check 1: If commit header contains WIP, everything is ok
+git rev-list --format=%s --max-count=1 HEAD | grep -q WIP && exit 0
+
+# Check 2: lintrunner on the latest patches.
+lintrunner --revision 'HEAD^'
+if [[ $? != 0 ]]
+ then
+ echo "Failed linting"
+ exit 1
+fi
+
+# Check 3: License headers
+# We do a simple check that the headers of all committed files contain "$current_year Arm".
+# This does not guarantee that CI will pass, but it should be correct most of the time.
+
+current_year=$(date +%Y)
+failed_license_check=false
+commit_files=$(git diff-tree --no-commit-id --name-only --diff-filter=ACMR HEAD -r)
+
+
+for committed_file in $commit_files; do
+    head $committed_file | grep -q "$current_year Arm"
+    if [[ $? != 0 ]]
+        then
+        echo "Header in $committed_file did not contain '$current_year Arm'"
+        failed_license_check=true
+    else
+        echo "$committed_file passed license check"
+    fi
+done
+
+if [[ $failed_license_check == true ]]
+ then
+ exit 1
+ else
+ echo "Passed simple license check"
+fi
+
+exit 0
diff --git a/backends/arm/scripts/setup-dev-env.sh b/backends/arm/scripts/setup-dev-env.sh
new file mode 100755
index 0000000000..b8c9b3b44c
--- /dev/null
+++ b/backends/arm/scripts/setup-dev-env.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+git_dir=$(git rev-parse --git-dir)
+ln $git_dir/../backends/arm/scripts/pre-push $git_dir/hooks
+ln $git_dir/../backends/arm/scripts/pre-commit $git_dir/hooks
\ No newline at end of file
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
index 7ebf89e392..091b2d5f26 100644
--- a/backends/arm/test/common.py
+++ b/backends/arm/test/common.py
@@ -9,9 +9,17 @@
import tempfile
from datetime import datetime
+
from pathlib import Path
+from typing import Any
+import pytest
from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder
+from executorch.backends.arm.test.runner_utils import (
+ arm_executor_runner_exists,
+ corstone300_installed,
+ corstone320_installed,
+)
from executorch.backends.arm.tosa_specification import TosaSpecification
from executorch.exir.backend.compile_spec_schema import CompileSpec
@@ -41,8 +49,8 @@ def maybe_get_tosa_collate_path() -> str | None:
if tosa_test_base:
current_test = os.environ.get("PYTEST_CURRENT_TEST")
#'backends/arm/test/ops/test_mean_dim.py::TestMeanDim::test_meandim_tosa_BI_0_zeros (call)'
- test_class = current_test.split("::")[1]
- test_name = current_test.split("::")[-1].split(" ")[0]
+ test_class = current_test.split("::")[1] # type: ignore[union-attr]
+ test_name = current_test.split("::")[-1].split(" ")[0] # type: ignore[union-attr]
if "BI" in test_name:
tosa_test_base = os.path.join(tosa_test_base, "tosa-bi")
elif "MI" in test_name:
@@ -100,7 +108,7 @@ def get_u85_compile_spec(
"""
Default compile spec for Ethos-U85 tests.
"""
- return get_u85_compile_spec_unbuilt(
+ return get_u85_compile_spec_unbuilt( # type: ignore[attr-defined]
custom_path=custom_path,
).build()
@@ -144,4 +152,45 @@ def get_u85_compile_spec_unbuilt(
)
.dump_intermediate_artifacts_to(artifact_path)
)
- return compile_spec
+ return compile_spec # type: ignore[return-value]
+
+
+SkipIfNoCorstone300 = pytest.mark.skipif(
+ not corstone300_installed() or not arm_executor_runner_exists("corstone-300"),
+ reason="Did not find Corstone-300 FVP or executor_runner on path",
+)
+"""Skips a test if Corsone300 FVP is not installed, or if the executor runner is not built"""
+
+SkipIfNoCorstone320 = pytest.mark.skipif(
+ not corstone320_installed() or not arm_executor_runner_exists("corstone-320"),
+ reason="Did not find Corstone-320 FVP or executor_runner on path",
+)
+"""Skips a test if Corsone320 FVP is not installed, or if the executor runner is not built."""
+
+
+def parametrize(
+    arg_name: str, test_data: dict[str, Any], xfails: dict[str, str] | None = None
+):
+ """
+    Custom version of pytest.mark.parametrize with some syntactic sugar and added xfail functionality:
+    - test_data is expected as a dict of (id, test_data) pairs
+    - xfails allows specifying a dict of (id, failure_reason) pairs to mark specific tests as xfail
+ """
+ if xfails is None:
+ xfails = {}
+
+ def decorator_func(func):
+ """Test data is transformed from a dict of (id, data) pairs to a list of pytest params to work with the native pytests parametrize function"""
+ pytest_testsuite = []
+ for id, test_parameters in test_data.items():
+ if id in xfails:
+ pytest_param = pytest.param(
+ test_parameters, id=id, marks=pytest.mark.xfail(reason=xfails[id])
+ )
+ else:
+ pytest_param = pytest.param(test_parameters, id=id)
+ pytest_testsuite.append(pytest_param)
+
+ return pytest.mark.parametrize(arg_name, pytest_testsuite)(func)
+
+ return decorator_func
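
For reference, here is a brief usage sketch of the new common.parametrize helper and its xfails argument, roughly as the rewritten op tests later in this patch use it. The test function, ids, and failure reason below are illustrative only and not part of the patch:

```python
import torch

from executorch.backends.arm.test import common

example_data = {
    "ones": (torch.ones(1, 3, 4, 4),),
    "rand": (torch.rand(1, 3, 4, 4),),
}


# xfails maps a test-data id to the reason the case is expected to fail.
@common.parametrize(
    "test_data", example_data, xfails={"rand": "Hypothetical: not yet supported"}
)
def test_example(test_data):
    (x,) = test_data
    assert x.shape == (1, 3, 4, 4)
```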
diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py
index a9491418a4..690549d717 100644
--- a/backends/arm/test/misc/test_debug_feats.py
+++ b/backends/arm/test/misc/test_debug_feats.py
@@ -48,7 +48,7 @@ def _tosa_MI_pipeline(self, module: torch.nn.Module, dump_file=None):
(
ArmTester(
module,
- example_inputs=module.get_inputs(),
+ example_inputs=module.get_inputs(), # type: ignore[operator]
compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
)
.export()
@@ -61,7 +61,7 @@ def _tosa_BI_pipeline(self, module: torch.nn.Module, dump_file=None):
(
ArmTester(
module,
- example_inputs=module.get_inputs(),
+ example_inputs=module.get_inputs(), # type: ignore[operator]
compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
)
.quantize()
diff --git a/backends/arm/test/misc/test_lifted_tensor.py b/backends/arm/test/misc/test_lifted_tensor.py
index a16b1e639b..092483fd63 100644
--- a/backends/arm/test/misc/test_lifted_tensor.py
+++ b/backends/arm/test/misc/test_lifted_tensor.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -10,7 +10,7 @@
import torch
from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from parameterized import parameterized
+from parameterized import parameterized # type: ignore[import-untyped]
class LiftedTensor(torch.nn.Module):
@@ -23,14 +23,14 @@ class LiftedTensor(torch.nn.Module):
(operator.sub, (torch.rand(2, 2), 2)),
]
- def __init__(self, op: callable):
+ def __init__(self, op: callable): # type: ignore[valid-type]
super().__init__()
self.op = op
self.lifted_tensor = torch.Tensor([[1, 2], [3, 4]])
def forward(self, x: torch.Tensor, length) -> torch.Tensor:
sliced = self.lifted_tensor[:, :length]
- return self.op(sliced, x)
+ return self.op(sliced, x) # type: ignore[misc]
class LiftedScalarTensor(torch.nn.Module):
@@ -42,13 +42,13 @@ class LiftedScalarTensor(torch.nn.Module):
(operator.sub, (torch.randn(3),), 1.0),
]
- def __init__(self, op: callable, arg1: Union[int, float, torch.tensor]):
+ def __init__(self, op: callable, arg1: Union[int, float, torch.tensor]): # type: ignore[valid-type]
super().__init__()
self.op = op
self.arg1 = arg1
def forward(self, x: torch.Tensor) -> torch.Tensor:
- return self.op(x, self.arg1)
+ return self.op(x, self.arg1) # type: ignore[misc]
class TestLiftedTensor(unittest.TestCase):
diff --git a/backends/arm/test/misc/test_tosa_spec.py b/backends/arm/test/misc/test_tosa_spec.py
index 77b10cf315..d61b3fe718 100644
--- a/backends/arm/test/misc/test_tosa_spec.py
+++ b/backends/arm/test/misc/test_tosa_spec.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -13,7 +13,7 @@
)
from executorch.exir.backend.compile_spec_schema import CompileSpec
-from parameterized import parameterized
+from parameterized import parameterized # type: ignore[import-untyped]
test_valid_0_80_strings = [
"TOSA-0.80+BI",
@@ -64,13 +64,13 @@
class TestTosaSpecification(unittest.TestCase):
"""Tests the TOSA specification class"""
- @parameterized.expand(test_valid_0_80_strings)
+ @parameterized.expand(test_valid_0_80_strings) # type: ignore[misc]
def test_version_string_0_80(self, version_string: str):
tosa_spec = TosaSpecification.create_from_string(version_string)
assert isinstance(tosa_spec, Tosa_0_80)
assert tosa_spec.profile in ["BI", "MI"]
- @parameterized.expand(test_valid_1_00_strings)
+ @parameterized.expand(test_valid_1_00_strings) # type: ignore[misc]
def test_version_string_1_00(self, version_string: str):
tosa_spec = TosaSpecification.create_from_string(version_string)
assert isinstance(tosa_spec, Tosa_1_00)
@@ -83,7 +83,7 @@ def test_version_string_1_00(self, version_string: str):
e in test_valid_1_00_extensions[profile] for e in tosa_spec.extensions
]
- @parameterized.expand(test_invalid_strings)
+ @parameterized.expand(test_invalid_strings) # type: ignore[misc]
def test_invalid_version_strings(self, version_string: str):
tosa_spec = None
with self.assertRaises(ValueError):
@@ -91,12 +91,12 @@ def test_invalid_version_strings(self, version_string: str):
assert tosa_spec is None
- @parameterized.expand(test_compile_specs)
+ @parameterized.expand(test_compile_specs) # type: ignore[misc]
def test_create_from_compilespec(self, compile_specs: list[CompileSpec]):
tosa_spec = TosaSpecification.create_from_compilespecs(compile_specs)
assert isinstance(tosa_spec, TosaSpecification)
- @parameterized.expand(test_compile_specs_no_version)
+ @parameterized.expand(test_compile_specs_no_version) # type: ignore[misc]
def test_create_from_invalid_compilespec(self, compile_specs: list[CompileSpec]):
tosa_spec = None
with self.assertRaises(ValueError):
diff --git a/backends/arm/test/models/test_conformer.py b/backends/arm/test/models/test_conformer.py
new file mode 100644
index 0000000000..e3be7811dd
--- /dev/null
+++ b/backends/arm/test/models/test_conformer.py
@@ -0,0 +1,126 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import unittest
+
+import torch
+from executorch.backends.arm.test import common, conftest
+
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+
+from torchaudio.models import Conformer
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class TestConformer(unittest.TestCase):
+ """Tests Torchaudio Conformer"""
+
+    # Adjust the numbers below as op support increases. Note: most of the
+    # delegate calls are directly consecutive to each other in the .pte. The
+    # reason for that is that some assert ops are removed by passes in the
+    # .to_executorch step, i.e. after the Arm partitioner.
+ ops_after_partitioner = {
+ "executorch_exir_dialects_edge__ops_aten_arange_start_step": 1,
+ "executorch_exir_dialects_edge__ops_aten_full_like_default": 4,
+ "executorch_exir_dialects_edge__ops_aten_max_default": 1,
+ "executorch_exir_dialects_edge__ops_aten_mul_Scalar": 4,
+ "executorch_exir_dialects_edge__ops_aten_eq_Scalar": 2,
+ "executorch_exir_dialects_edge__ops_aten_where_self": 4,
+ "executorch_exir_dialects_edge__ops_aten_logical_not_default": 4,
+ "executorch_exir_dialects_edge__ops_aten_any_dim": 2,
+ "torch.ops.aten._assert_scalar.default": 10,
+ "torch.ops.aten._local_scalar_dense.default": 1,
+ "torch.ops.aten.scalar_tensor.default": 2,
+ "torch.ops.higher_order.executorch_call_delegate": 5,
+ }
+
+ dim = 16
+ lengths = torch.randint(1, 100, (10,), dtype=torch.int32)
+ input_data = torch.rand(10, int(lengths.max()), dim)
+ conformer = Conformer(
+ input_dim=dim,
+ num_heads=4,
+ ffn_dim=64,
+ num_layers=2,
+ depthwise_conv_kernel_size=31,
+ )
+ conformer = conformer.eval()
+
+ def test_conformer_tosa_MI(self):
+ (
+ ArmTester(
+ self.conformer,
+ example_inputs=(self.input_data, self.lengths),
+ compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-0.80+MI"),
+ )
+ .export()
+ .to_edge_transform_and_lower()
+ .dump_operator_distribution()
+ .check_count(self.ops_after_partitioner)
+ .to_executorch()
+ # TODO(MLETORCH-632): Fix numerical errors
+ .run_method_and_compare_outputs(
+ inputs=(self.input_data, self.lengths), rtol=1, atol=5
+ )
+ )
+
+ @unittest.expectedFailure # TODO(MLETORCH-635)
+ def test_conformer_tosa_BI(self):
+ (
+ ArmTester(
+ self.conformer,
+ example_inputs=(self.input_data, self.lengths),
+ compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-0.80+BI"),
+ )
+ .quantize()
+ .export()
+ .to_edge_transform_and_lower()
+ .to_executorch()
+ .run_method_and_compare_outputs(
+ qtol=1, rtol=1, atol=5, inputs=(self.input_data, self.lengths)
+ )
+ )
+
+ @unittest.expectedFailure # TODO(MLETORCH-635)
+ def test_conformer_u55_BI(self):
+ tester = (
+ ArmTester(
+ self.conformer,
+ example_inputs=(self.input_data, self.lengths),
+ compile_spec=common.get_u55_compile_spec(),
+ )
+ .quantize()
+ .export()
+ .to_edge_transform_and_lower()
+ .to_executorch()
+ .serialize()
+ )
+ if conftest.is_option_enabled("corstone_fvp"):
+ tester.run_method_and_compare_outputs(
+ atol=1.0, qtol=1, inputs=(self.input_data, self.lengths)
+ )
+
+ @unittest.expectedFailure # TODO(MLETORCH-635)
+ def test_conformer_u85_BI(self):
+ tester = (
+ ArmTester(
+ self.conformer,
+ example_inputs=(self.input_data, self.lengths),
+ compile_spec=common.get_u85_compile_spec(),
+ )
+ .quantize()
+ .export()
+ .to_edge_transform_and_lower()
+ .to_executorch()
+ .serialize()
+ )
+ if conftest.is_option_enabled("corstone_fvp"):
+ tester.run_method_and_compare_outputs(
+ atol=1.0, qtol=1, inputs=(self.input_data, self.lengths)
+ )
diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py
index 21bd43202d..62b14a1022 100644
--- a/backends/arm/test/models/test_mobilenet_v2_arm.py
+++ b/backends/arm/test/models/test_mobilenet_v2_arm.py
@@ -14,8 +14,10 @@
from executorch.backends.arm.test import common, conftest
from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from torchvision import models, transforms
-from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
+from torchvision import models, transforms # type: ignore[import-untyped]
+from torchvision.models.mobilenetv2 import ( # type: ignore[import-untyped]
+ MobileNet_V2_Weights,
+)
logger = logging.getLogger(__name__)
diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py
index db6fde53ae..b4b43f88c7 100644
--- a/backends/arm/test/ops/test_add.py
+++ b/backends/arm/test/ops/test_add.py
@@ -5,169 +5,143 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-import unittest
from typing import Tuple
-import pytest
import torch
-from executorch.backends.arm.test import common, conftest
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir.backend.compile_spec_schema import CompileSpec
-from parameterized import parameterized
-
-
-class TestSimpleAdd(unittest.TestCase):
- """Tests a single add op, x+x and x+y."""
-
- class Add(torch.nn.Module):
- test_parameters = [
- (torch.FloatTensor([1, 2, 3, 5, 7]),),
- (3 * torch.ones(8),),
- (10 * torch.randn(8),),
- (torch.ones(1, 1, 4, 4),),
- (torch.ones(1, 3, 4, 2),),
- ]
-
- def forward(self, x):
- return x + x
-
- class Add2(torch.nn.Module):
- test_parameters = [
- (
- torch.FloatTensor([1, 2, 3, 5, 7]),
- (torch.FloatTensor([2, 1, 2, 1, 10])),
- ),
- (torch.ones(1, 10, 4, 6), torch.ones(1, 10, 4, 6)),
- (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)),
- (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)),
- (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)),
- ]
-
- def __init__(self):
- super().__init__()
-
- def forward(self, x, y):
- return x + y
-
- def _test_add_tosa_MI_pipeline(
- self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
- ):
- (
- ArmTester(
- module,
- example_inputs=test_data,
- compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
- )
- .export()
- .check_count({"torch.ops.aten.add.Tensor": 1})
- .check_not(["torch.ops.quantized_decomposed"])
- .to_edge()
- .partition()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .to_executorch()
- .run_method_and_compare_outputs(inputs=test_data)
- )
-
- def _test_add_tosa_BI_pipeline(
- self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
- ):
- (
- ArmTester(
- module,
- example_inputs=test_data,
- compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
- )
- .quantize()
- .export()
- .check_count({"torch.ops.aten.add.Tensor": 1})
- .check(["torch.ops.quantized_decomposed"])
- .to_edge()
- .partition()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .to_executorch()
- .run_method_and_compare_outputs(inputs=test_data, qtol=1)
- )
-
- def _test_add_ethos_BI_pipeline(
- self,
- module: torch.nn.Module,
- compile_spec: CompileSpec,
- test_data: Tuple[torch.Tensor],
- ):
- tester = (
- ArmTester(
- module,
- example_inputs=test_data,
- compile_spec=compile_spec,
- )
- .quantize()
- .export()
- .check_count({"torch.ops.aten.add.Tensor": 1})
- .check(["torch.ops.quantized_decomposed"])
- .to_edge()
- .partition()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .to_executorch()
- .serialize()
- )
- if conftest.is_option_enabled("corstone_fvp"):
- tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
-
- return tester
-
- @parameterized.expand(Add.test_parameters)
- def test_add_tosa_MI(self, test_data: torch.Tensor):
- test_data = (test_data,)
- self._test_add_tosa_MI_pipeline(self.Add(), test_data)
-
- @parameterized.expand(Add.test_parameters)
- def test_add_tosa_BI(self, test_data: torch.Tensor):
- test_data = (test_data,)
- self._test_add_tosa_BI_pipeline(self.Add(), test_data)
-
- @parameterized.expand(Add.test_parameters)
- @pytest.mark.corstone_fvp
- def test_add_u55_BI(self, test_data: torch.Tensor):
- test_data = (test_data,)
- self._test_add_ethos_BI_pipeline(
- self.Add(),
- common.get_u55_compile_spec(),
- test_data,
- )
-
- @parameterized.expand(Add.test_parameters)
- @pytest.mark.corstone_fvp
- def test_add_u85_BI(self, test_data: torch.Tensor):
- test_data = (test_data,)
- self._test_add_ethos_BI_pipeline(
- self.Add(),
- common.get_u85_compile_spec(),
- test_data,
- )
-
- @parameterized.expand(Add2.test_parameters)
- def test_add2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
- self._test_add_tosa_MI_pipeline(self.Add2(), test_data)
-
- @parameterized.expand(Add2.test_parameters)
- def test_add2_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
- self._test_add_tosa_BI_pipeline(self.Add2(), test_data)
-
- @parameterized.expand(Add2.test_parameters)
- @pytest.mark.corstone_fvp
- def test_add2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
- self._test_add_ethos_BI_pipeline(
- self.Add2(), common.get_u55_compile_spec(), test_data
- )
-
- @parameterized.expand(Add2.test_parameters)
- @pytest.mark.corstone_fvp
- def test_add2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
- self._test_add_ethos_BI_pipeline(
- self.Add2(), common.get_u85_compile_spec(), test_data
- )
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+ EthosU55PipelineBI,
+ EthosU85PipelineBI,
+ TosaPipelineBI,
+ TosaPipelineMI,
+)
+
+aten_op = "torch.ops.aten.add.Tensor"
+exir_op = "executorch_exir_dialects_edge__ops_aten_add_Tensor"
+
+input_t1 = Tuple[torch.Tensor] # Input x
+
+
+class Add(torch.nn.Module):
+ def forward(self, x: torch.Tensor):
+ return x + x
+
+    test_data: dict[str, input_t1] = {
+ "5d_float": (torch.FloatTensor([1, 2, 3, 5, 7]),),
+ "1d_ones": ((3 * torch.ones(8),)),
+ "1d_randn": (10 * torch.randn(8),),
+ "4d_ones_1": (torch.ones(1, 1, 4, 4),),
+ "4d_ones_2": (torch.ones(1, 3, 4, 2),),
+ }
+
+
+input_t2 = Tuple[torch.Tensor, torch.Tensor] # Input x, y
+
+
+class Add2(torch.nn.Module):
+ def forward(self, x: torch.Tensor, y: torch.Tensor):
+ return x + y
+
+    test_data: dict[str, input_t2] = {
+ "5d_float": (
+ torch.FloatTensor([1, 2, 3, 5, 7]),
+ (torch.FloatTensor([2, 1, 2, 1, 10])),
+ ),
+ "4d_ones": (torch.ones(1, 10, 4, 6), torch.ones(1, 10, 4, 6)),
+ "4d_randn_1": (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)),
+ "4d_randn_2": (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)),
+ "4d_randn_big": (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)),
+ }
+
+
+@common.parametrize("test_data", Add.test_data)
+def test_add_tosa_MI(test_data: input_t1):
+ pipeline = TosaPipelineMI[input_t1](Add(), test_data, aten_op, exir_op)
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add.test_data)
+def test_add_tosa_BI(test_data: input_t1):
+ pipeline = TosaPipelineBI[input_t1](Add(), test_data, aten_op, exir_op)
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add.test_data)
+def test_add_u55_BI(test_data: input_t1):
+ pipeline = EthosU55PipelineBI[input_t1](
+ Add(), test_data, aten_op, exir_op, run_on_fvp=False
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add.test_data)
+def test_add_u85_BI(test_data: input_t1):
+ pipeline = EthosU85PipelineBI[input_t1](
+ Add(), test_data, aten_op, exir_op, run_on_fvp=False
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add.test_data)
+@common.SkipIfNoCorstone300
+def test_add_u55_BI_on_fvp(test_data: input_t1):
+ pipeline = EthosU55PipelineBI[input_t1](
+ Add(), test_data, aten_op, exir_op, run_on_fvp=True
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add.test_data)
+@common.SkipIfNoCorstone320
+def test_add_u85_BI_on_fvp(test_data: input_t1):
+ pipeline = EthosU85PipelineBI[input_t1](
+ Add(), test_data, aten_op, exir_op, run_on_fvp=True
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add2.test_data)
+def test_add2_tosa_MI(test_data: input_t2):
+ pipeline = TosaPipelineMI[input_t2](Add2(), test_data, aten_op, exir_op)
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add2.test_data)
+def test_add2_tosa_BI(test_data: input_t2):
+ pipeline = TosaPipelineBI[input_t2](Add2(), test_data, aten_op, exir_op)
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add2.test_data)
+def test_add2_u55_BI(test_data: input_t2):
+ pipeline = EthosU55PipelineBI[input_t2](
+ Add2(), test_data, aten_op, exir_op, run_on_fvp=False
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add2.test_data)
+@common.SkipIfNoCorstone300
+def test_add2_u55_BI_on_fvp(test_data: input_t2):
+ pipeline = EthosU55PipelineBI[input_t2](
+ Add2(), test_data, aten_op, exir_op, run_on_fvp=True
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add2.test_data)
+def test_add2_u85_BI(test_data: input_t2):
+ pipeline = EthosU85PipelineBI[input_t2](
+ Add2(), test_data, aten_op, exir_op, run_on_fvp=False
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_data", Add2.test_data)
+@common.SkipIfNoCorstone320
+def test_add2_u85_BI_on_fvp(test_data: input_t2):
+ pipeline = EthosU85PipelineBI[input_t2](
+ Add2(), test_data, aten_op, exir_op, run_on_fvp=True
+ )
+ pipeline.run()
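
As a sketch of how the FVP-gated variants above could be extended with a one-off case, the following hypothetical test reuses the Corstone-300 skip marker and the same pipeline class; the test name and input are made up for illustration:

```python
# Hypothetical one-off case gated on the Corstone-300 FVP, mirroring the
# *_on_fvp tests above (not part of the patch).
@common.SkipIfNoCorstone300
def test_add_single_case_u55_BI_on_fvp():
    data = (torch.ones(1, 3, 4, 2),)
    EthosU55PipelineBI[input_t1](
        Add(), data, aten_op, exir_op, run_on_fvp=True
    ).run()
```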
diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py
index 06470d91e8..bd6e1ef689 100644
--- a/backends/arm/test/ops/test_bmm.py
+++ b/backends/arm/test/ops/test_bmm.py
@@ -6,7 +6,7 @@
import unittest
-from typing import Tuple
+from typing import Callable, Tuple
import pytest
@@ -16,39 +16,37 @@
from executorch.exir.backend.compile_spec_schema import CompileSpec
from parameterized import parameterized
-torch.manual_seed(1)
-
class TestBMM(unittest.TestCase):
"""Tests Batch MatMul"""
class BMM(torch.nn.Module):
- test_parameters = [
- (torch.rand(2, 1, 1), torch.rand(2, 1, 1)),
- (torch.rand(5, 3, 5), torch.rand(5, 5, 2)),
- (torch.ones(1, 55, 3), torch.ones(1, 3, 44)),
- (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)),
- (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)),
+ test_data_generators = [
+ lambda: (torch.rand(2, 1, 1), torch.rand(2, 1, 1)),
+ lambda: (torch.rand(5, 3, 5), torch.rand(5, 5, 2)),
+ lambda: (torch.ones(1, 55, 3), torch.ones(1, 3, 44)),
+ lambda: (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)),
+ lambda: (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)),
]
def forward(self, x, y):
return torch.bmm(x, y)
class MatMul(torch.nn.Module):
- test_parameters = [
- (torch.rand(2, 3, 5), torch.rand(2, 5, 2)),
- (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)),
+ test_data_generators = [
+ lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2)),
+ lambda: (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)),
]
def forward(self, x, y):
return torch.matmul(x, y)
class BMMSingleInput(torch.nn.Module):
- test_parameters = [
- (torch.rand(20, 3, 3),),
- (torch.rand(2, 128, 128),),
- (10000 * torch.randn(4, 25, 25),),
- (5 + 5 * torch.randn(3, 64, 64),),
+ test_data_generators = [
+ lambda: (torch.rand(20, 3, 3),),
+ lambda: (torch.rand(2, 128, 128),),
+ lambda: (10000 * torch.randn(4, 25, 25),),
+ lambda: (5 + 5 * torch.randn(3, 64, 64),),
]
def forward(self, x):
@@ -120,67 +118,69 @@ def _test_bmm_ethosu_BI_pipeline(
if conftest.is_option_enabled("corstone_fvp"):
tester.run_method_and_compare_outputs(inputs=test_data, qtol=1)
- @parameterized.expand(BMM.test_parameters)
- def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ @parameterized.expand(BMM.test_data_generators)
+ def test_bmm_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_bmm_tosa_MI_pipeline(self.BMM(), test_data)
- @parameterized.expand(BMMSingleInput.test_parameters)
- def test_bmm_single_input_tosa_MI(self, operand1: torch.Tensor):
- test_data = (operand1,)
+ @parameterized.expand(BMMSingleInput.test_data_generators)
+ def test_bmm_single_input_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_bmm_tosa_MI_pipeline(self.BMMSingleInput(), test_data)
- @parameterized.expand(MatMul.test_parameters)
- def test_matmul_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ @parameterized.expand(MatMul.test_data_generators)
+ def test_matmul_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_bmm_tosa_MI_pipeline(self.MatMul(), test_data)
- @parameterized.expand(MatMul.test_parameters)
- def test_matmul_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ @parameterized.expand(MatMul.test_data_generators)
+ def test_matmul_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_bmm_tosa_BI_pipeline(self.MatMul(), test_data)
- @parameterized.expand(BMM.test_parameters)
- def test_bmm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ @parameterized.expand(BMM.test_data_generators)
+ def test_bmm_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data)
- @parameterized.expand(BMMSingleInput.test_parameters)
- def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor):
- test_data = (operand1,)
+ @parameterized.expand(BMMSingleInput.test_data_generators)
+ def test_bmm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data)
- @parameterized.expand(BMM.test_parameters)
+ @parameterized.expand(BMM.test_data_generators)
@pytest.mark.corstone_fvp
@unittest.expectedFailure
- def test_bmm_u55_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ def test_bmm_u55_BI_xfails(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_bmm_ethosu_BI_pipeline(
self.BMM(), common.get_u55_compile_spec(), test_data
)
- @parameterized.expand(BMM.test_parameters)
+ @parameterized.expand(BMM.test_data_generators)
@pytest.mark.corstone_fvp
- def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ def test_bmm_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_bmm_ethosu_BI_pipeline(
self.BMM(), common.get_u85_compile_spec(), test_data
)
# Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy
- @parameterized.expand(BMMSingleInput.test_parameters)
+ @parameterized.expand(BMMSingleInput.test_data_generators)
@pytest.mark.corstone_fvp
@unittest.expectedFailure
- def test_bmm_single_input_u55_BI_xfails(self, operand1: torch.Tensor):
- test_data = (operand1,)
+ def test_bmm_single_input_u55_BI_xfails(
+ self, test_data_generator: Callable[[], Tuple]
+ ):
+ test_data = test_data_generator()
self._test_bmm_ethosu_BI_pipeline(
self.BMMSingleInput(), common.get_u55_compile_spec(), test_data
)
- @parameterized.expand(BMMSingleInput.test_parameters)
+ @parameterized.expand(BMMSingleInput.test_data_generators)
@pytest.mark.corstone_fvp
- def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor):
- test_data = (operand1,)
+ def test_bmm_single_input_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_bmm_ethosu_BI_pipeline(
self.BMMSingleInput(), common.get_u85_compile_spec(), test_data
)
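
The switch from stored tensors to zero-argument lambdas defers random data generation until the test body runs; a standalone sketch of the pattern (illustrative, not part of the test suite):

```python
import torch

# Each case is a callable, so fresh random tensors are produced at test time
# rather than once at import time.
gen = lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2))

x, y = gen()            # new tensors for this invocation
out = torch.bmm(x, y)   # -> shape (2, 3, 2)
```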
diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py
index 115b4402f5..a1613d1d04 100644
--- a/backends/arm/test/ops/test_cat.py
+++ b/backends/arm/test/ops/test_cat.py
@@ -1,5 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -33,6 +33,8 @@ class Cat(torch.nn.Module):
),
-1,
),
+ ((torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 1)), 3),
+ ((torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 4)), 0),
((torch.randn(2, 2, 4, 4), torch.randn(2, 2, 4, 1)), 3),
(
(
@@ -47,8 +49,8 @@ class Cat(torch.nn.Module):
def __init__(self):
super().__init__()
- def forward(self, tensors: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor:
- return torch.cat(tensors, dim=dim)
+ def forward(self, t: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor:
+ return torch.cat(t, dim=dim)
def _test_cat_tosa_MI_pipeline(
self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int]
@@ -134,22 +136,38 @@ def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int):
test_data = (operands, dim)
self._test_cat_tosa_BI_pipeline(self.Cat(), test_data)
- # Mismatch in provided number of inputs and model signature, MLETORCH 519
- @parameterized.expand(Cat.test_parameters)
+ @parameterized.expand(Cat.test_parameters[:-3])
@pytest.mark.corstone_fvp
- @conftest.expectedFailureOnFVP
def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int):
test_data = (operands, dim)
self._test_cat_ethosu_BI_pipeline(
self.Cat(), common.get_u55_compile_spec(), test_data
)
- # Mismatch in provided number of inputs and model signature, MLETORCH 519
- @parameterized.expand(Cat.test_parameters)
+ # MLETORCH-630 Cat does not work on FVP with batch>1
+ @parameterized.expand(Cat.test_parameters[-3:])
@pytest.mark.corstone_fvp
@conftest.expectedFailureOnFVP
+ def test_cat_u55_BI_xfails(self, operands: tuple[torch.Tensor, ...], dim: int):
+ test_data = (operands, dim)
+ self._test_cat_ethosu_BI_pipeline(
+ self.Cat(), common.get_u55_compile_spec(), test_data
+ )
+
+ @parameterized.expand(Cat.test_parameters[:-3])
+ @pytest.mark.corstone_fvp
def test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int):
test_data = (operands, dim)
self._test_cat_ethosu_BI_pipeline(
self.Cat(), common.get_u85_compile_spec(), test_data
)
+
+ # MLETORCH-630 Cat does not work on FVP with batch>1
+ @parameterized.expand(Cat.test_parameters[-3:])
+ @pytest.mark.corstone_fvp
+ @conftest.expectedFailureOnFVP
+ def test_cat_u85_BI_xfails(self, operands: tuple[torch.Tensor, ...], dim: int):
+ test_data = (operands, dim)
+ self._test_cat_ethosu_BI_pipeline(
+ self.Cat(), common.get_u85_compile_spec(), test_data
+ )
diff --git a/backends/arm/test/ops/test_clamp.py b/backends/arm/test/ops/test_clamp.py
new file mode 100644
index 0000000000..5cf333068c
--- /dev/null
+++ b/backends/arm/test/ops/test_clamp.py
@@ -0,0 +1,165 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from numbers import Number
+from typing import Tuple, Union
+
+import pytest
+import torch
+
+from executorch.backends.arm.quantizer.arm_quantizer import (
+ ArmQuantizer,
+ get_symmetric_quantization_config,
+)
+from executorch.backends.arm.test import common, conftest
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa_specification import TosaSpecification
+from executorch.backends.xnnpack.test.tester.tester import Quantize
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from parameterized import parameterized
+
+
+test_data_suite = [
+ # (test_name, test_data, min, max)
+ ("rank_1", torch.rand(10) * 2, -1.0, 1.0),
+ ("rank_2", torch.rand(1, 35), 0.5, 0.8),
+ ("rank_3", torch.ones(1, 10, 10), -1, -1),
+ ("rank_4", torch.rand(1, 10, 10, 1) * 2, -0.1, 2.0),
+ ("rank_4_mixed_min_max_dtype", torch.rand(1, 10, 10, 5) + 10, 8.0, 10),
+ ("rank_4_no_min", torch.rand(1, 10, 10, 1) * 10, None, 5),
+ ("rank_4_no_max", torch.rand(1, 10, 10, 1) - 3, -3.3, None),
+]
+
+
+class TestClamp(unittest.TestCase):
+ """Tests Clamp Operator."""
+
+ class Clamp(torch.nn.Module):
+ def __init__(
+ self,
+ min: Union[torch.Tensor, Number, None],
+ max: Union[torch.Tensor, Number, None],
+ ):
+ super().__init__()
+
+ self.clamp_min = min
+ self.clamp_max = max
+
+ def forward(self, x):
+ return torch.clamp(x, self.clamp_min, self.clamp_max)
+
+ def _test_clamp_tosa_MI_pipeline(
+ self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
+ ):
+ (
+ ArmTester(
+ module,
+ example_inputs=test_data,
+ compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
+ )
+ .export()
+ .check(["torch.ops.aten.clamp.default"])
+ .check_not(["torch.ops.quantized_decomposed"])
+ .to_edge_transform_and_lower()
+ .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+ .to_executorch()
+ .run_method_and_compare_outputs(inputs=test_data)
+ )
+
+ def _test_clamp_tosa_BI_pipeline(
+ self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
+ ):
+ tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+ compile_spec = common.get_tosa_compile_spec(tosa_spec)
+ quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
+ (
+ ArmTester(
+ module,
+ example_inputs=test_data,
+ compile_spec=compile_spec,
+ )
+ .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
+ .export()
+ .check_count({"torch.ops.aten.clamp.default": 1})
+ .check(["torch.ops.quantized_decomposed"])
+ .to_edge_transform_and_lower()
+ .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+ .to_executorch()
+ .run_method_and_compare_outputs(inputs=test_data)
+ )
+
+ def _test_clamp_tosa_ethos_BI_pipeline(
+ self,
+ compile_spec: list[CompileSpec],
+ module: torch.nn.Module,
+ test_data: Tuple[torch.tensor],
+ ):
+ tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+ quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
+ tester = (
+ ArmTester(
+ module,
+ example_inputs=test_data,
+ compile_spec=compile_spec,
+ )
+ .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
+ .export()
+ .check_count({"torch.ops.aten.clamp.default": 1})
+ .check(["torch.ops.quantized_decomposed"])
+ .to_edge_transform_and_lower()
+ .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+ .to_executorch()
+ .serialize()
+ )
+ if conftest.is_option_enabled("corstone_fvp"):
+ tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
+
+ @parameterized.expand(test_data_suite)
+ def test_clamp_tosa_MI(
+ self,
+ test_name: str,
+ test_data: torch.Tensor,
+ min: Union[torch.Tensor, Number, None],
+ max: Union[torch.Tensor, Number, None],
+ ):
+ self._test_clamp_tosa_MI_pipeline(self.Clamp(min, max), (test_data,))
+
+ @parameterized.expand(test_data_suite)
+ def test_clamp_tosa_BI(
+ self,
+ test_name: str,
+ test_data: torch.Tensor,
+ min: Union[torch.Tensor, Number, None],
+ max: Union[torch.Tensor, Number, None],
+ ):
+ self._test_clamp_tosa_BI_pipeline(self.Clamp(min, max), (test_data,))
+
+ @parameterized.expand(test_data_suite)
+ @pytest.mark.corstone_fvp
+ def test_clamp_tosa_u55_BI(
+ self,
+ test_name: str,
+ test_data: torch.Tensor,
+ min: Union[torch.Tensor, Number, None],
+ max: Union[torch.Tensor, Number, None],
+ ):
+ self._test_clamp_tosa_ethos_BI_pipeline(
+ common.get_u55_compile_spec(), self.Clamp(min, max), (test_data,)
+ )
+
+ @parameterized.expand(test_data_suite)
+ @pytest.mark.corstone_fvp
+ def test_clamp_tosa_u85_BI(
+ self,
+ test_name: str,
+ test_data: torch.Tensor,
+ min: Union[torch.Tensor, Number, None],
+ max: Union[torch.Tensor, Number, None],
+ ):
+ self._test_clamp_tosa_ethos_BI_pipeline(
+ common.get_u85_compile_spec(), self.Clamp(min, max), (test_data,)
+ )
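
The IO-quantization setup used by the BI pipelines in this test boils down to three calls, all of which appear above; a minimal sketch of that sequence in isolation:

```python
# Build a quantizer from the TOSA spec, make it quantize model inputs/outputs
# as well, and wrap it in the tester's Quantize stage.
tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
quantize_stage = Quantize(quantizer, get_symmetric_quantization_config())
```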
diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py
index 3e0dfa6c5c..92da09a5ef 100644
--- a/backends/arm/test/ops/test_conv1d.py
+++ b/backends/arm/test/ops/test_conv1d.py
@@ -6,7 +6,7 @@
import unittest
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple, Union
import pytest
@@ -25,7 +25,6 @@ class Conv1d(torch.nn.Module):
def __init__(
self,
- inputs: Optional[torch.Tensor] = None,
length=8,
nbr_conv=1, # Number of chained convs
in_channels: Union[List, int, None] = None,
@@ -75,11 +74,10 @@ def __init__(
if not isinstance(padding_mode, List):
padding_mode = [padding_mode]
- # Generate test data if not provided
- if inputs is None:
- self.inputs = (torch.randn(batches, in_channels[0], length).to(dtype),)
- else:
- self.inputs = (inputs,)
+ self.batches = batches
+ self.in_channels = in_channels
+ self.length = length
+ self.dtype = dtype
# Build chain of convs
for i in range(self.nbr_convs):
@@ -100,7 +98,9 @@ def __init__(
)
def get_inputs(self):
- return self.inputs
+ return (
+ torch.randn(self.batches, self.in_channels[0], self.length).to(self.dtype),
+ )
def forward(self, x):
for i in range(self.nbr_convs):
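
The same idea in isolation: the module stores only the shape parameters and builds a fresh random input on every get_inputs() call. A minimal sketch with a made-up class name:

```python
import torch


class LazyInputModule(torch.nn.Module):
    """Illustrative stand-in for the pattern above, not the actual Conv1d test class."""

    def __init__(self, batches=1, channels=3, length=8, dtype=torch.float32):
        super().__init__()
        self.batches, self.channels, self.length, self.dtype = batches, channels, length, dtype

    def get_inputs(self):
        # A new random tensor is generated per call instead of being cached at init.
        return (torch.randn(self.batches, self.channels, self.length).to(self.dtype),)
```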
diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py
index b80228c6f2..878c65757f 100644
--- a/backends/arm/test/ops/test_conv2d.py
+++ b/backends/arm/test/ops/test_conv2d.py
@@ -4,17 +4,20 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-import unittest
-from typing import List, Optional, Tuple, Union
-
-import pytest
+from typing import List, Tuple, Union
import torch
-from executorch.backends.arm.test import common, conftest
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir.backend.compile_spec_schema import CompileSpec
-from parameterized import parameterized
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+ EthosU55PipelineBI,
+ EthosU85PipelineBI,
+ TosaPipelineBI,
+ TosaPipelineMI,
+)
+
+aten_op = "torch.ops.aten.conv2d.default"
+exir_op = "executorch_exir_dialects_edge__ops_aten_convolution_default"
class Conv2d(torch.nn.Module):
@@ -25,7 +28,6 @@ class Conv2d(torch.nn.Module):
def __init__(
self,
- inputs: Optional[torch.Tensor] = None,
height=8,
width=8,
nbr_conv=1, # Number of chained convs
@@ -76,13 +78,11 @@ def __init__(
if not isinstance(padding_mode, List):
padding_mode = [padding_mode]
- # Generate test data if not provided
- if inputs is None:
- self.inputs = (
- torch.randn(batches, in_channels[0], height, width).to(dtype),
- )
- else:
- self.inputs = (inputs,)
+ self.batches = batches
+ self.in_channels = in_channels
+ self.height = height
+ self.width = width
+ self.dtype = dtype
# Build chain of convs
for i in range(self.nbr_convs):
@@ -103,7 +103,11 @@ def __init__(
)
def get_inputs(self):
- return self.inputs
+ return (
+ torch.randn(self.batches, self.in_channels[0], self.height, self.width).to(
+ self.dtype
+ ),
+ )
def forward(self, x):
for i in range(self.nbr_convs):
@@ -325,124 +329,80 @@ def forward(self, x):
# Shenanigan to get a nicer output when test fails. With unittest it looks like:
# FAIL: test_conv2d_tosa_BI_2_3x3_1x3x12x12_st2_pd1
-testsuite = [
- ("2x2_3x2x40x40_nobias", conv2d_2x2_3x2x40x40_nobias),
- ("3x3_1x3x256x256_st1", conv2d_3x3_1x3x256x256_st1),
- ("3x3_1x3x12x12_st2_pd1", conv2d_3x3_1x3x12x12_st2_pd1),
- ("1x1_1x2x128x128_st1", conv2d_1x1_1x2x128x128_st1),
- ("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2),
- ("5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1),
- ("7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass", conv2d_7x7_1x3x16x16_st2_pd1_dl2),
- ("7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass", conv2d_7x7_1x3x15x15_st1_pd0_dl1),
- ("5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass", conv2d_5x5_1x3x14x14_st5_pd0_dl1),
- ("5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass", conv2d_5x5_1x3x9x9_st5_pd0_dl1),
- ("3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass", conv2d_3x3_1x3x9x8_st3_pd0_dl1),
- ("3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass", conv2d_3x3_1x3x8x9_st3_pd0_dl1),
- ("3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass", conv2d_3x4_1x3x7x7_st3_pd0_dl1),
- ("4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass", conv2d_4x3_1x3x7x7_st3_pd0_dl1),
- ("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1),
- ("3x3_1x3x224x224_st2_pd1", conv2d_3x3_1x3x224x224_st2_pd1),
- ("two_conv2d_nobias", two_conv2d_nobias),
- ("two_conv2d", two_conv2d),
-]
-
-
-class TestConv2D(unittest.TestCase):
- """Tests Conv2D, both single ops and multiple Convolutions in series."""
-
- def _test_conv2d_tosa_MI_pipeline(
- self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
- ):
- (
- ArmTester(
- module,
- example_inputs=test_data,
- compile_spec=common.get_tosa_compile_spec(
- "TOSA-0.80+MI",
- ),
- )
- .export()
- .to_edge()
- .partition()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"])
- .to_executorch()
- .run_method_and_compare_outputs(inputs=test_data)
- )
-
- def _test_conv2d_tosa_BI_pipeline(
- self,
- module: torch.nn.Module,
- test_data: Tuple[torch.Tensor],
- ):
- (
- ArmTester(
- module,
- example_inputs=test_data,
- compile_spec=common.get_tosa_compile_spec(
- "TOSA-0.80+BI",
- ),
- )
- .quantize()
- .export()
- .to_edge()
- .partition()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"])
- .to_executorch()
- .run_method_and_compare_outputs(inputs=test_data, qtol=1)
- )
-
- def _test_conv2d_ethosu_BI_pipeline(
- self,
- compile_spec: CompileSpec,
- module: torch.nn.Module,
- test_data: Tuple[torch.Tensor],
- ):
- tester = (
- ArmTester(
- module,
- example_inputs=test_data,
- compile_spec=compile_spec,
- )
- .quantize()
- .export()
- .to_edge()
- .partition()
- .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
- .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"])
- .to_executorch()
- .serialize()
- )
- if conftest.is_option_enabled("corstone_fvp"):
- tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
-
- @parameterized.expand(testsuite)
- def test_conv2d_tosa_MI(self, test_name, model):
- self._test_conv2d_tosa_MI_pipeline(model, model.get_inputs())
-
- @parameterized.expand(testsuite)
- def test_conv2d_tosa_BI(self, test_name, model):
- self._test_conv2d_tosa_BI_pipeline(model, model.get_inputs())
-
- # These cases have numerical issues on FVP, MLETORCH-520
- testsuite.remove(("2x2_3x2x40x40_nobias", conv2d_2x2_3x2x40x40_nobias))
- testsuite.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1))
-
- @parameterized.expand(testsuite)
- @pytest.mark.corstone_fvp
- def test_conv2d_u55_BI(self, test_name, model):
- self._test_conv2d_ethosu_BI_pipeline(
- common.get_u55_compile_spec(),
- model,
- model.get_inputs(),
- )
-
- @parameterized.expand(testsuite)
- @pytest.mark.corstone_fvp
- def test_conv2d_u85_BI(self, test_name, model):
- self._test_conv2d_ethosu_BI_pipeline(
- common.get_u85_compile_spec(),
- model,
- model.get_inputs(),
- )
+test_modules = {
+ "2x2_3x2x40x40_nobias": conv2d_2x2_3x2x40x40_nobias,
+ "3x3_1x3x256x256_st1": conv2d_3x3_1x3x256x256_st1,
+ "3x3_1x3x12x12_st2_pd1": conv2d_3x3_1x3x12x12_st2_pd1,
+ "1x1_1x2x128x128_st1": conv2d_1x1_1x2x128x128_st1,
+ "2x2_1x1x14x13_st2_needs_adjust_pass": conv2d_2x2_1x1x14x13_st2,
+ "5x5_1x3x14x15_st3_pd1_needs_adjust_pass": conv2d_5x5_1x3x14x15_st3_pd1,
+ "7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass": conv2d_7x7_1x3x16x16_st2_pd1_dl2,
+ "7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass": conv2d_7x7_1x3x15x15_st1_pd0_dl1,
+ "5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass": conv2d_5x5_1x3x14x14_st5_pd0_dl1,
+ "5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass": conv2d_5x5_1x3x9x9_st5_pd0_dl1,
+ "3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass": conv2d_3x3_1x3x9x8_st3_pd0_dl1,
+ "3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass": conv2d_3x3_1x3x8x9_st3_pd0_dl1,
+ "3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": conv2d_3x4_1x3x7x7_st3_pd0_dl1,
+ "4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": conv2d_4x3_1x3x7x7_st3_pd0_dl1,
+ "5x5_3x2x128x128_st1": conv2d_5x5_3x2x128x128_st1,
+ "3x3_1x3x224x224_st2_pd1": conv2d_3x3_1x3x224x224_st2_pd1,
+ "two_conv2d_nobias": two_conv2d_nobias,
+ "two_conv2d": two_conv2d,
+}
+
+fvp_xfails = {
+ "2x2_3x2x40x40_nobias": "MLETORCH-520: Numerical issues on FVP.",
+ "5x5_3x2x128x128_st1": "MLETORCH-520: Numerical issues on FVP.",
+}
+input_t = Tuple[torch.Tensor]
+
+
+@common.parametrize("test_module", test_modules)
+def test_conv2d_tosa_MI(test_module):
+ pipeline = TosaPipelineMI[input_t](
+ test_module, test_module.get_inputs(), aten_op, exir_op
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_module", test_modules)
+def test_conv2d_tosa_BI(test_module):
+ pipeline = TosaPipelineBI[input_t](
+ test_module, test_module.get_inputs(), aten_op, exir_op
+ )
+ pipeline.change_args("run_method_and_compare_outputs", qtol=1)
+ pipeline.run()
+
+
+@common.parametrize("test_module", test_modules)
+def test_conv2d_u55_BI(test_module):
+ pipeline = EthosU55PipelineBI[input_t](
+ test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=False
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_module", test_modules)
+def test_conv2d_u85_BI(test_module):
+ pipeline = EthosU85PipelineBI[input_t](
+ test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=False
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_module", test_modules, fvp_xfails)
+@common.SkipIfNoCorstone300
+def test_conv2d_u55_BI_on_fvp(test_module):
+ pipeline = EthosU55PipelineBI[input_t](
+ test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True
+ )
+ pipeline.run()
+
+
+@common.parametrize("test_module", test_modules, fvp_xfails)
+@common.SkipIfNoCorstone320
+def test_conv2d_u85_BI_on_fvp(test_module):
+ pipeline = EthosU85PipelineBI[input_t](
+ test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True
+ )
+ pipeline.run()
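
Extending this table-driven suite only requires a new entry in test_modules (the key doubles as the pytest id) and, if needed, a matching key in fvp_xfails; a hypothetical registration reusing an existing module instance:

```python
# Hypothetical extra case (names are illustrative, not part of the patch).
test_modules["two_conv2d_extra"] = two_conv2d
fvp_xfails["two_conv2d_extra"] = "Hypothetical: numerical issue on FVP."
```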
diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py
index 8352727a1c..f6e13a2222 100644
--- a/backends/arm/test/ops/test_conv_combos.py
+++ b/backends/arm/test/ops/test_conv_combos.py
@@ -16,6 +16,7 @@
from executorch.backends.arm.test.tester.arm_tester import ArmTester
from executorch.exir.backend.backend_details import CompileSpec
from parameterized import parameterized
+from torch.nn.parameter import Parameter
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -112,12 +113,16 @@ class ComboConvBatchnormRelu6(torch.nn.Module):
"executorch_exir_dialects_edge__ops_aten_hardtanh_default",
]
- def __init__(self):
+ def __init__(self, affine: bool):
super().__init__()
self.conv2d = torch.nn.Conv2d(
in_channels=3, out_channels=3, kernel_size=3, stride=1, groups=1
)
- self.batch_norm2d = torch.nn.BatchNorm2d(3, affine=False)
+ self.batch_norm2d = torch.nn.BatchNorm2d(3, affine=affine)
+ self.batch_norm2d.running_mean = torch.rand(3)
+ self.batch_norm2d.running_var = torch.rand(3)
+ self.batch_norm2d.weight = Parameter(torch.rand(3))
+ self.batch_norm2d.bias = Parameter(torch.rand(3))
self.relu6 = torch.nn.ReLU6()
def get_inputs(self) -> Tuple[torch.Tensor]:
@@ -289,24 +294,30 @@ def test_conv_meandim_u85_BI(self):
##############################
## Conv + batch norm + relu ##
##############################
- def test_conv_batchnorm_relu6_tosa_MI(self):
- model = ComboConvBatchnormRelu6()
+ affine_params = [("affine", True), ("_no_affine", False)]
+
+ @parameterized.expand(affine_params)
+ def test_conv_batchnorm_relu6_tosa_MI(self, test_suffix, affine):
+ model = ComboConvBatchnormRelu6(affine)
self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs())
- def test_conv_batchnorm_relu6_tosa_BI(self):
- model = ComboConvBatchnormRelu6()
+ @parameterized.expand(affine_params)
+ def test_conv_batchnorm_relu6_tosa_BI(self, test_suffix, affine):
+ model = ComboConvBatchnormRelu6(affine)
self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs())
+ @parameterized.expand(affine_params)
@pytest.mark.corstone_fvp
- def test_conv_batchnorm_relu6_u55_BI(self):
- model = ComboConvBatchnormRelu6()
+ def test_conv_batchnorm_relu6_u55_BI(self, test_suffix, affine):
+ model = ComboConvBatchnormRelu6(affine)
self._test_conv_combo_ethos_BI_pipeline(
model, common.get_u55_compile_spec(), model.get_inputs()
)
+ @parameterized.expand(affine_params)
@pytest.mark.corstone_fvp
- def test_conv_batchnorm_relu_u85_BI(self):
- model = ComboConvBatchnormRelu6()
+ def test_conv_batchnorm_relu_u85_BI(self, test_suffix, affine):
+ model = ComboConvBatchnormRelu6(affine)
self._test_conv_combo_ethos_BI_pipeline(
model,
common.get_u85_compile_spec(),
@@ -353,8 +364,7 @@ def test_block_bottleneck_residual_tosa_MI(self):
model = ComboBlockBottleneckResidual()
self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs())
- # TODO: Investigate flakyness (MLTORCH-307)
- @unittest.skip(reason="Skiped due to flakyness (MLTORCH-307)")
+    @pytest.mark.flaky  # TODO: Investigate flakiness (MLTORCH-307)
def test_block_bottleneck_residual_tosa_BI(self):
model = ComboBlockBottleneckResidual()
self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs())
diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py
index b8d69c89f1..59ce628693 100644
--- a/backends/arm/test/ops/test_depthwise_conv.py
+++ b/backends/arm/test/ops/test_depthwise_conv.py
@@ -252,8 +252,8 @@ def _test_dw_conv_ethos_BI_pipeline(
def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module):
self._test_dw_conv_tosa_MI_pipeline(model, model.get_inputs())
- # TODO: Investigate flakyness (MLTORCH-307)
@parameterized.expand(testsuite_conv1d + testsuite_conv2d)
+    @pytest.mark.flaky  # TODO: Investigate flakiness (MLTORCH-307)
def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module):
self._test_dw_conv_tosa_BI_pipeline(model, model.get_inputs())
diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py
index 116f5d64e8..d0807f3db0 100644
--- a/backends/arm/test/ops/test_expand.py
+++ b/backends/arm/test/ops/test_expand.py
@@ -37,15 +37,17 @@ class Expand(torch.nn.Module):
test_parameters = [
(torch.rand(1), (2,)),
(torch.randn(1, 4), (1, -1)),
- (torch.rand(1, 1, 2, 2), (4, 3, -1, 2)),
(torch.randn(1), (2, 2, 4)),
- (torch.rand(3, 2, 4, 1), (-1, -1, -1, 3)),
+ (torch.randn(1, 1, 1, 5), (1, 4, -1, -1)),
(torch.randn(1, 1, 192), (1, -1, -1)),
+ (torch.randn(1, 1), (1, 2, 2, 4)),
+ (torch.randn(1, 1), (2, 2, 2, 4)),
(torch.randn(10, 1, 1, 97), (-1, 4, -1, -1)),
+ (torch.rand(1, 1, 2, 2), (4, 3, -1, 2)),
]
- def forward(self, x: torch.Tensor, multiples: Sequence):
- return x.expand(multiples)
+ def forward(self, x: torch.Tensor, m: Sequence):
+ return x.expand(m)
def _test_expand_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
(
@@ -113,20 +115,34 @@ def test_expand_tosa_MI(self, test_input, multiples):
def test_expand_tosa_BI(self, test_input, multiples):
self._test_expand_tosa_BI_pipeline(self.Expand(), (test_input, multiples))
- # Mismatch in provided number of inputs and model signature, MLETORCH 519
- @parameterized.expand(Expand.test_parameters)
+ @parameterized.expand(Expand.test_parameters[:-3])
@pytest.mark.corstone_fvp
- @conftest.expectedFailureOnFVP
def test_expand_u55_BI(self, test_input, multiples):
self._test_expand_ethosu_BI_pipeline(
common.get_u55_compile_spec(), self.Expand(), (test_input, multiples)
)
- # Mismatch in provided number of inputs and model signature, MLETORCH 519
- @parameterized.expand(Expand.test_parameters)
+ # MLETORCH-629: Expand does not work on FVP with batch>1
+ @parameterized.expand(Expand.test_parameters[-3:])
@pytest.mark.corstone_fvp
@conftest.expectedFailureOnFVP
+ def test_expand_u55_BI_xfails(self, test_input, multiples):
+ self._test_expand_ethosu_BI_pipeline(
+ common.get_u55_compile_spec(), self.Expand(), (test_input, multiples)
+ )
+
+ @parameterized.expand(Expand.test_parameters[:-3])
+ @pytest.mark.corstone_fvp
def test_expand_u85_BI(self, test_input, multiples):
self._test_expand_ethosu_BI_pipeline(
common.get_u85_compile_spec(), self.Expand(), (test_input, multiples)
)
+
+ # MLETORCH-629: Expand does not work on FVP with batch>1
+ @parameterized.expand(Expand.test_parameters[-3:])
+ @pytest.mark.corstone_fvp
+ @conftest.expectedFailureOnFVP
+ def test_expand_u85_BI_xfails(self, test_input, multiples):
+ self._test_expand_ethosu_BI_pipeline(
+ common.get_u85_compile_spec(), self.Expand(), (test_input, multiples)
+ )
diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py
index fc82fa4dd7..586e6bd4db 100644
--- a/backends/arm/test/ops/test_full.py
+++ b/backends/arm/test/ops/test_full.py
@@ -143,20 +143,16 @@ def test_full_tosa_MI(self, test_tensor: Tuple):
def test_full_tosa_BI(self, test_tensor: Tuple):
self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor)
- # Mismatch in provided number of inputs and model signature, MLETORCH 519
@parameterized.expand(AddVariableFull.test_parameters)
@pytest.mark.corstone_fvp
- @conftest.expectedFailureOnFVP
def test_full_u55_BI(self, test_tensor: Tuple):
self._test_full_tosa_u55_pipeline(
self.AddVariableFull(),
test_tensor,
)
- # Mismatch in provided number of inputs and model signature, MLETORCH 519
@parameterized.expand(AddVariableFull.test_parameters)
@pytest.mark.corstone_fvp
- @conftest.expectedFailureOnFVP
def test_full_u85_BI(self, test_tensor: Tuple):
self._test_full_tosa_u85_pipeline(
self.AddVariableFull(),
diff --git a/backends/arm/test/ops/test_hardsigmoid.py b/backends/arm/test/ops/test_hardsigmoid.py
new file mode 100644
index 0000000000..f73a995b12
--- /dev/null
+++ b/backends/arm/test/ops/test_hardsigmoid.py
@@ -0,0 +1,128 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+from typing import Tuple
+
+import pytest
+import torch
+
+from executorch.backends.arm.test import common, conftest
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from parameterized import parameterized
+
+
+test_data_suite = [
+ # (test_name, test_data)
+ ("zeros", torch.zeros(1, 10, 10, 10)),
+ ("ones", torch.ones(10, 10, 10)),
+ ("rand", torch.rand(10, 10) - 0.5),
+ ("randn_pos", torch.randn(10) + 10),
+ ("randn_neg", torch.randn(10) - 10),
+ ("ramp", torch.arange(-16, 16, 0.2)),
+]
+
+
+class TestHardsigmoid(unittest.TestCase):
+ class Hardsigmoid(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.hardsigmoid = torch.nn.Hardsigmoid()
+
+ def forward(self, x):
+ return self.hardsigmoid(x)
+
+ def _test_hardsigmoid_tosa_MI_pipeline(
+ self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
+ ):
+ (
+ ArmTester(
+ module,
+ example_inputs=test_data,
+ compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
+ )
+ .export()
+ .check(["torch.ops.aten.hardsigmoid.default"])
+ .check_not(["torch.ops.quantized_decomposed"])
+ .to_edge_transform_and_lower()
+ .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"])
+ .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+ .to_executorch()
+ .run_method_and_compare_outputs(inputs=test_data)
+ )
+
+ def _test_hardsigmoid_tosa_BI_pipeline(
+ self, module: torch.nn.Module, test_data: Tuple
+ ):
+ (
+ ArmTester(
+ module,
+ example_inputs=test_data,
+ compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
+ )
+ .quantize()
+ .export()
+ .check(["torch.ops.aten.hardsigmoid.default"])
+ .check(["torch.ops.quantized_decomposed"])
+ .to_edge_transform_and_lower()
+ .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"])
+ .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+ .to_executorch()
+ .run_method_and_compare_outputs(inputs=test_data)
+ )
+
+ def _test_hardsigmoid_tosa_ethos_BI_pipeline(
+ self,
+ compile_spec: list[CompileSpec],
+ module: torch.nn.Module,
+ test_data: Tuple[torch.Tensor],
+ ):
+ tester = (
+ ArmTester(
+ module,
+ example_inputs=test_data,
+ compile_spec=compile_spec,
+ )
+ .quantize()
+ .export()
+ .check_count({"torch.ops.aten.hardsigmoid.default": 1})
+ .check(["torch.ops.quantized_decomposed"])
+ .to_edge_transform_and_lower()
+ .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"])
+ .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+ .to_executorch()
+ .serialize()
+ )
+ if conftest.is_option_enabled("corstone_fvp"):
+ tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
+
+ @parameterized.expand(test_data_suite)
+ def test_hardsigmoid_tosa_MI(
+ self,
+ test_name: str,
+ test_data: torch.Tensor,
+ ):
+ self._test_hardsigmoid_tosa_MI_pipeline(self.Hardsigmoid(), (test_data,))
+
+ @parameterized.expand(test_data_suite)
+ def test_hardsigmoid_tosa_BI(self, test_name: str, test_data: torch.Tensor):
+ self._test_hardsigmoid_tosa_BI_pipeline(self.Hardsigmoid(), (test_data,))
+
+ @parameterized.expand(test_data_suite)
+ @pytest.mark.corstone_fvp
+ def test_hardsigmoid_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor):
+ self._test_hardsigmoid_tosa_ethos_BI_pipeline(
+ common.get_u55_compile_spec(), self.Hardsigmoid(), (test_data,)
+ )
+
+ @parameterized.expand(test_data_suite)
+ @pytest.mark.corstone_fvp
+ def test_hardsigmoid_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor):
+ self._test_hardsigmoid_tosa_ethos_BI_pipeline(
+ common.get_u85_compile_spec(), self.Hardsigmoid(), (test_data,)
+ )
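Reviewer note: the check_not on the edge clamp op in this new test relies on hardsigmoid decomposing through a clamp that ends up inside the delegate. A small standalone sketch of the underlying identity (my own reference snippet, not code from this patch):

    import torch

    def hardsigmoid_ref(x: torch.Tensor) -> torch.Tensor:
        # hardsigmoid(x) == clamp(x / 6 + 1/2, 0, 1) == relu6(x + 3) / 6
        return torch.clamp(x / 6.0 + 0.5, min=0.0, max=1.0)

    x = torch.arange(-16, 16, 0.2)
    assert torch.allclose(hardsigmoid_ref(x), torch.nn.functional.hardsigmoid(x))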
diff --git a/backends/arm/test/ops/test_hardswish.py b/backends/arm/test/ops/test_hardswish.py
new file mode 100644
index 0000000000..81aba540e3
--- /dev/null
+++ b/backends/arm/test/ops/test_hardswish.py
@@ -0,0 +1,128 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+from typing import Tuple
+
+import pytest
+import torch
+
+from executorch.backends.arm.test import common, conftest
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from parameterized import parameterized
+
+
+test_data_suite = [
+ # (test_name, test_data)
+ ("zeros", torch.zeros(1, 10, 10, 10)),
+ ("ones", torch.ones(10, 10, 10)),
+ ("rand", torch.rand(10, 10) - 0.5),
+ ("randn_pos", torch.randn(10) + 10),
+ ("randn_neg", torch.randn(10) - 10),
+ ("ramp", torch.arange(-16, 16, 0.2)),
+]
+
+
+class TestHardswish(unittest.TestCase):
+ class Hardswish(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.hardswish = torch.nn.Hardswish()
+
+ def forward(self, x):
+ return self.hardswish(x)
+
+ def _test_hardswish_tosa_MI_pipeline(
+ self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
+ ):
+ (
+ ArmTester(
+ module,
+ example_inputs=test_data,
+ compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
+ )
+ .export()
+ .check(["torch.ops.aten.hardswish.default"])
+ .check_not(["torch.ops.quantized_decomposed"])
+ .to_edge_transform_and_lower()
+ .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"])
+ .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+ .to_executorch()
+ .run_method_and_compare_outputs(inputs=test_data)
+ )
+
+ def _test_hardswish_tosa_BI_pipeline(
+ self, module: torch.nn.Module, test_data: Tuple
+ ):
+ (
+ ArmTester(
+ module,
+ example_inputs=test_data,
+ compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
+ )
+ .quantize()
+ .export()
+ .check(["torch.ops.aten.hardswish.default"])
+ .check(["torch.ops.quantized_decomposed"])
+ .to_edge_transform_and_lower()
+ .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"])
+ .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+ .to_executorch()
+ .run_method_and_compare_outputs(inputs=test_data)
+ )
+
+ def _test_hardswish_tosa_ethos_BI_pipeline(
+ self,
+ compile_spec: list[CompileSpec],
+ module: torch.nn.Module,
+ test_data: Tuple[torch.Tensor],
+ ):
+ tester = (
+ ArmTester(
+ module,
+ example_inputs=test_data,
+ compile_spec=compile_spec,
+ )
+ .quantize()
+ .export()
+ .check_count({"torch.ops.aten.hardswish.default": 1})
+ .check(["torch.ops.quantized_decomposed"])
+ .to_edge_transform_and_lower()
+ .check_not(["executorch_exir_dialects_edge__ops_aten_clamp_default"])
+ .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+ .to_executorch()
+ .serialize()
+ )
+ if conftest.is_option_enabled("corstone_fvp"):
+ tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
+
+ @parameterized.expand(test_data_suite)
+ def test_hardswish_tosa_MI(
+ self,
+ test_name: str,
+ test_data: torch.Tensor,
+ ):
+ self._test_hardswish_tosa_MI_pipeline(self.Hardswish(), (test_data,))
+
+ @parameterized.expand(test_data_suite)
+ def test_hardswish_tosa_BI(self, test_name: str, test_data: torch.Tensor):
+ self._test_hardswish_tosa_BI_pipeline(self.Hardswish(), (test_data,))
+
+ @parameterized.expand(test_data_suite)
+ @pytest.mark.corstone_fvp
+ def test_hardswish_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor):
+ self._test_hardswish_tosa_ethos_BI_pipeline(
+ common.get_u55_compile_spec(), self.Hardswish(), (test_data,)
+ )
+
+ @parameterized.expand(test_data_suite)
+ @pytest.mark.corstone_fvp
+ def test_hardswish_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor):
+ self._test_hardswish_tosa_ethos_BI_pipeline(
+ common.get_u85_compile_spec(), self.Hardswish(), (test_data,)
+ )
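Reviewer note: same remark as for hardsigmoid above; hardswish is the self-gated variant, so the clamp check applies here as well. A reference sketch of the identity (mine, not from this patch):

    import torch

    def hardswish_ref(x: torch.Tensor) -> torch.Tensor:
        # hardswish(x) == x * clamp(x / 6 + 1/2, 0, 1) == x * hardsigmoid(x)
        return x * torch.clamp(x / 6.0 + 0.5, min=0.0, max=1.0)

    x = torch.arange(-16, 16, 0.2)
    assert torch.allclose(hardswish_ref(x), torch.nn.functional.hardswish(x))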
diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py
index c287f51ebc..82f0af8dcf 100644
--- a/backends/arm/test/ops/test_layer_norm.py
+++ b/backends/arm/test/ops/test_layer_norm.py
@@ -109,7 +109,7 @@ def _test_layernorm_tosa_BI_pipeline(
.partition()
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
.to_executorch()
- .run_method_and_compare_outputs(inputs=test_data)
+ .run_method_and_compare_outputs(qtol=1, inputs=test_data)
)
def _test_layernorm_ethosu_BI_pipeline(
diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py
index d1581423a0..f34d4afbb5 100644
--- a/backends/arm/test/ops/test_logsoftmax.py
+++ b/backends/arm/test/ops/test_logsoftmax.py
@@ -6,7 +6,9 @@
import unittest
-from typing import Tuple
+from typing import Callable, Tuple
+
+import pytest
import torch
from executorch.backends.arm.test import common
@@ -15,27 +17,27 @@
from parameterized import parameterized
-test_data_suite = [
+test_data_generators = [
# (test_name, test_data, dim)
- ("zeros", torch.zeros(10, 8, 5, 2), 0),
- ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
- ("ones", torch.ones(10, 10), 1),
- ("ones_neg_dim", torch.ones(10, 3, 4), -1),
- ("rand", torch.rand(1, 2, 5, 8), 2),
- ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
- ("randn", torch.randn(10, 10, 10, 10), 3),
- ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
+ lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0),
+ lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
+ lambda: ("ones", torch.ones(10, 10), 1),
+ lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1),
+ lambda: ("rand", torch.rand(1, 2, 5, 8), 2),
+ lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
+ lambda: ("randn", torch.randn(10, 10, 10, 10), 3),
+ lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
]
-test_data_suite_u55 = [
+test_data_generators_u55 = [
# (test_name, test_data, dim)
- ("ones", torch.ones(10, 10), 1),
- ("ones_neg_dim", torch.ones(10, 3, 4), -1),
- ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
- ("zeros", torch.zeros(10, 8, 5, 2), 0),
- ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
- ("rand", torch.rand(1, 2, 5, 8), 2),
- ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
- ("randn", torch.randn(10, 10, 10, 10), 3),
+ lambda: ("ones", torch.ones(10, 10), 1),
+ lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1),
+ lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
+ lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0),
+ lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
+ lambda: ("rand", torch.rand(1, 2, 5, 8), 2),
+ lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
+ lambda: ("randn", torch.randn(10, 10, 10, 10), 3),
]
@@ -128,42 +130,29 @@ def _test_logsoftmax_tosa_u85_BI_pipeline(
common.get_u85_compile_spec(), module, test_data
)
- @parameterized.expand(test_data_suite)
- def test_logsoftmax_tosa_MI(
- self,
- test_name: str,
- test_data: torch.Tensor,
- dim: int,
- ):
+ @parameterized.expand(test_data_generators)
+ def test_logsoftmax_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+ test_name, test_data, dim = test_data_generator()
self._test_logsoftmax_tosa_MI_pipeline(self.LogSoftmax(dim=dim), (test_data,))
- @parameterized.expand(test_data_suite)
- def test_logsoftmax_tosa_BI(
- self,
- test_name: str,
- test_data: torch.Tensor,
- dim: int,
- ):
+ @parameterized.expand(test_data_generators)
+ @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+ def test_logsoftmax_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_name, test_data, dim = test_data_generator()
self._test_logsoftmax_tosa_BI_pipeline(self.LogSoftmax(dim=dim), (test_data,))
- @parameterized.expand(test_data_suite_u55)
- def test_logsoftmax_tosa_u55_BI(
- self,
- test_name: str,
- test_data: torch.Tensor,
- dim: int,
- ):
+ @parameterized.expand(test_data_generators_u55)
+ @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+ def test_logsoftmax_tosa_u55_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_name, test_data, dim = test_data_generator()
self._test_logsoftmax_tosa_u55_BI_pipeline(
self.LogSoftmax(dim=dim), (test_data,)
)
- @parameterized.expand(test_data_suite)
- def test_logsoftmax_tosa_u85_BI(
- self,
- test_name: str,
- test_data: torch.Tensor,
- dim: int,
- ):
+ @parameterized.expand(test_data_generators)
+ @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+ def test_logsoftmax_tosa_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_name, test_data, dim = test_data_generator()
self._test_logsoftmax_tosa_u85_BI_pipeline(
self.LogSoftmax(dim=dim), (test_data,)
)
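Reviewer note: the move from tensor tuples to zero-argument lambdas defers random data creation from module import to test execution, which is what makes the pytest.mark.flaky retries useful: each rerun draws a fresh sample instead of replaying the same unlucky one. A minimal self-contained sketch of the pattern (names below are illustrative, not from this file):

    import unittest

    import torch
    from parameterized import parameterized

    test_data_generators = [
        # Each entry is a zero-argument callable; the tensor is created only
        # when the test body calls it, not when the module is imported.
        lambda: ("rand", torch.rand(2, 5), 1),
        lambda: ("randn_neg_dim", torch.randn(3, 4), -1),
    ]

    class LogSoftmaxShapeDemo(unittest.TestCase):
        @parameterized.expand(test_data_generators)
        def test_shape(self, test_data_generator):
            _, data, dim = test_data_generator()
            out = torch.nn.functional.log_softmax(data, dim=dim)
            self.assertEqual(out.shape, data.shape)

    if __name__ == "__main__":
        unittest.main()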
diff --git a/backends/arm/test/ops/test_maximum.py b/backends/arm/test/ops/test_maximum.py
index 1fe2c20148..a255496d51 100644
--- a/backends/arm/test/ops/test_maximum.py
+++ b/backends/arm/test/ops/test_maximum.py
@@ -109,7 +109,6 @@ def test_maximum_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
self._test_maximum_tosa_BI_pipeline(self.Maximum(), test_data)
@parameterized.expand(Maximum.test_parameters)
- @unittest.expectedFailure # Bug in Vela, disabled until pin changes, bug MLETORCH-513
def test_maximum_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
test_data = (operand1, operand2)
tester = self._test_maximum_ethos_BI_pipeline(
diff --git a/backends/arm/test/ops/test_minimum.py b/backends/arm/test/ops/test_minimum.py
index d455ca1d43..04693a4643 100644
--- a/backends/arm/test/ops/test_minimum.py
+++ b/backends/arm/test/ops/test_minimum.py
@@ -109,7 +109,6 @@ def test_minimum_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
self._test_minimum_tosa_BI_pipeline(self.Minimum(), test_data)
@parameterized.expand(Minimum.test_parameters)
- @unittest.expectedFailure # Bug in Vela, disabled until pin changes, bug MLETORCH-513
def test_minimum_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
test_data = (operand1, operand2)
tester = self._test_minimum_ethos_BI_pipeline(
diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py
index 5fa28076aa..d9b58da904 100644
--- a/backends/arm/test/ops/test_mm.py
+++ b/backends/arm/test/ops/test_mm.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -7,8 +7,9 @@
import logging
import unittest
-from typing import Tuple
+from typing import Callable, Tuple
+import pytest
import torch
from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.arm_tester import ArmTester
@@ -18,30 +19,28 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
-torch.manual_seed(0)
-
class TestMM(unittest.TestCase):
"""Tests MatMul"""
class MM(torch.nn.Module):
- test_parameters = [
- (torch.rand(3, 5), torch.rand(5, 2)),
- (torch.rand(1, 1), torch.rand(1, 1)),
- (torch.ones(55, 3), torch.ones(3, 44)),
- (10000 * torch.randn(1, 10), torch.randn(10, 5)),
- (-10 * torch.randn(32, 64), 5 + 5 * torch.randn(64, 32)),
+ test_data_generators = [
+ lambda: (torch.rand(3, 5), torch.rand(5, 2)),
+ lambda: (torch.rand(1, 1), torch.rand(1, 1)),
+ lambda: (torch.ones(55, 3), torch.ones(3, 44)),
+ lambda: (10000 * torch.randn(1, 10), torch.randn(10, 5)),
+ lambda: (-10 * torch.randn(32, 64), 5 + 5 * torch.randn(64, 32)),
]
def forward(self, x, y):
return torch.mm(x, y)
class MMSingleInput(torch.nn.Module):
- test_parameters = [
- (torch.rand(3, 3),),
- (torch.ones(128, 128),),
- (10000 * torch.randn(25, 25),),
- (5 + 5 * torch.randn(64, 64),),
+ test_data_generators = [
+ lambda: (torch.rand(3, 3),),
+ lambda: (torch.ones(128, 128),),
+ lambda: (10000 * torch.randn(25, 25),),
+ lambda: (5 + 5 * torch.randn(64, 64),),
]
def forward(self, x):
@@ -110,54 +109,55 @@ def _test_mm_ethosu_BI_pipeline(
.to_executorch()
)
- @parameterized.expand(MM.test_parameters)
- def test_mm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ @parameterized.expand(MM.test_data_generators)
+ def test_mm_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_mm_tosa_MI_pipeline(self.MM(), test_data)
- @parameterized.expand(MMSingleInput.test_parameters)
- def test_mm_single_input_tosa_MI(self, operand1: torch.Tensor):
- test_data = (operand1,)
+ @parameterized.expand(MMSingleInput.test_data_generators)
+ def test_mm_single_input_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_mm_tosa_MI_pipeline(self.MMSingleInput(), test_data)
- @parameterized.expand(MM.test_parameters)
- def test_mm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ @parameterized.expand(MM.test_data_generators)
+ @pytest.mark.flaky # TODO: Investigate flakiness (MLETORCH-534)
+ def test_mm_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_mm_tosa_BI_pipeline(self.MM(), test_data)
- @parameterized.expand(MMSingleInput.test_parameters)
- def test_mm_single_input_tosa_BI(self, operand1: torch.Tensor):
- test_data = (operand1,)
+ @parameterized.expand(MMSingleInput.test_data_generators)
+ def test_mm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_mm_tosa_BI_pipeline(self.MMSingleInput(), test_data)
# Expected to fail with error: CPU performance estimation for "MatMul" not implemented
- @parameterized.expand(MM.test_parameters)
+ @parameterized.expand(MM.test_data_generators)
@unittest.expectedFailure
- def test_mm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ def test_mm_u55_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_mm_ethosu_BI_pipeline(
common.get_u55_compile_spec(), self.MM(), test_data
)
# Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy
- @parameterized.expand(MMSingleInput.test_parameters)
+ @parameterized.expand(MMSingleInput.test_data_generators)
@unittest.expectedFailure
- def test_mm_single_input_u55_BI(self, operand1: torch.Tensor):
- test_data = (operand1,)
+ def test_mm_single_input_u55_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_mm_ethosu_BI_pipeline(
common.get_u55_compile_spec(), self.MMSingleInput(), test_data
)
- @parameterized.expand(MM.test_parameters)
- def test_mm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
- test_data = (operand1, operand2)
+ @parameterized.expand(MM.test_data_generators)
+ def test_mm_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_mm_ethosu_BI_pipeline(
common.get_u85_compile_spec(), self.MM(), test_data
)
- @parameterized.expand(MMSingleInput.test_parameters)
- def test_mm_single_input_u85_BI(self, operand1: torch.Tensor):
- test_data = (operand1,)
+ @parameterized.expand(MMSingleInput.test_data_generators)
+ def test_mm_single_input_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_data = test_data_generator()
self._test_mm_ethosu_BI_pipeline(
common.get_u85_compile_spec(), self.MMSingleInput(), test_data
)
diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py
index 794f6b791f..c60da18594 100644
--- a/backends/arm/test/ops/test_softmax.py
+++ b/backends/arm/test/ops/test_softmax.py
@@ -7,7 +7,9 @@
import unittest
-from typing import Tuple
+from typing import Callable, Tuple
+
+import pytest
import torch
from executorch.backends.arm.test import common
@@ -16,28 +18,28 @@
from parameterized import parameterized
-test_data_suite = [
+test_data_generators = [
# (test_name, test_data, dim)
- ("zeros", torch.zeros(10, 8, 5, 2), 0),
- ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
- ("ones", torch.ones(10, 10), 1),
- ("ones_neg_dim", torch.ones(10, 3, 4), -1),
- ("rand", torch.rand(1, 2, 5, 8), 2),
- ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
- ("randn", torch.randn(10, 10, 10, 10), 3),
- ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
+ lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0),
+ lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
+ lambda: ("ones", torch.ones(10, 10), 1),
+ lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1),
+ lambda: ("rand", torch.rand(1, 2, 5, 8), 2),
+ lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
+ lambda: ("randn", torch.randn(10, 10, 10, 10), 3),
+ lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
]
-test_data_suite_u55 = [
+test_data_generators_u55 = [
# (test_name, test_data, dim)
- ("ones", torch.ones(10, 10), 1),
- ("ones_neg_dim", torch.ones(10, 3, 4), -1),
- ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
- ("zeros", torch.zeros(10, 8, 5, 2), 0),
- ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
- ("rand", torch.rand(1, 2, 5, 8), 2),
- ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
- ("randn", torch.randn(10, 10, 10, 10), 3),
+ lambda: ("ones", torch.ones(10, 10), 1),
+ lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1),
+ lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3),
+ lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0),
+ lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4),
+ lambda: ("rand", torch.rand(1, 2, 5, 8), 2),
+ lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2),
+ lambda: ("randn", torch.randn(10, 10, 10, 10), 3),
]
@@ -130,38 +132,25 @@ def _test_softmax_tosa_u85_BI_pipeline(
common.get_u85_compile_spec(), module, test_data
)
- @parameterized.expand(test_data_suite)
- def test_softmax_tosa_MI(
- self,
- test_name: str,
- test_data: torch.Tensor,
- dim: int,
- ):
+ @parameterized.expand(test_data_generators)
+ def test_softmax_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
+ test_name, test_data, dim = test_data_generator()
self._test_softmax_tosa_MI_pipeline(self.Softmax(dim=dim), (test_data,))
- @parameterized.expand(test_data_suite)
- def test_softmax_tosa_BI(
- self,
- test_name: str,
- test_data: torch.Tensor,
- dim: int,
- ):
+ @parameterized.expand(test_data_generators)
+ @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+ def test_softmax_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_name, test_data, dim = test_data_generator()
self._test_softmax_tosa_BI_pipeline(self.Softmax(dim=dim), (test_data,))
- @parameterized.expand(test_data_suite_u55)
- def test_softmax_tosa_u55_BI(
- self,
- test_name: str,
- test_data: torch.Tensor,
- dim: int,
- ):
+ @parameterized.expand(test_data_generators_u55)
+ @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+ def test_softmax_tosa_u55_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_name, test_data, dim = test_data_generator()
self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,))
- @parameterized.expand(test_data_suite)
- def test_softmax_tosa_u85_BI(
- self,
- test_name: str,
- test_data: torch.Tensor,
- dim: int,
- ):
+ @parameterized.expand(test_data_generators)
+ @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation
+ def test_softmax_tosa_u85_BI(self, test_data_generator: Callable[[], Tuple]):
+ test_name, test_data, dim = test_data_generator()
self._test_softmax_tosa_u85_BI_pipeline(self.Softmax(dim=dim), (test_data,))
diff --git a/backends/arm/test/passes/test_cast_int64_pass.py b/backends/arm/test/passes/test_cast_int64_pass.py
new file mode 100644
index 0000000000..fdfab1f3af
--- /dev/null
+++ b/backends/arm/test/passes/test_cast_int64_pass.py
@@ -0,0 +1,44 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.arm._passes.cast_int64_pass import CastInt64ToInt32Pass
+
+from executorch.backends.arm.test import common
+
+from executorch.backends.arm.test.tester.arm_tester import ArmTester, RunPasses
+
+
+class Int64Model(torch.nn.Module):
+
+ def forward(self, x: torch.Tensor):
+ return x + 3
+
+ def get_inputs(self):
+ return (torch.rand(4),)
+
+
+class TestCastInt64Pass(unittest.TestCase):
+
+ def test_int64_model(self):
+ module = Int64Model()
+ test_pass_stage = RunPasses(passes_with_exported_program=[CastInt64ToInt32Pass])
+ tester = (
+ ArmTester(
+ module,
+ example_inputs=module.get_inputs(),
+ compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
+ )
+ .export()
+ .to_edge()
+ .run_passes(test_pass_stage)
+ .run_method_and_compare_outputs()
+ )
+ exported_program = tester.get_artifact("RunPasses").exported_program()
+ for state in exported_program.state_dict:
+ assert exported_program.state_dict[state].dtype != torch.int64
diff --git a/backends/arm/test/passes/test_fuse_batchnorm_pass.py b/backends/arm/test/passes/test_fuse_batchnorm_pass.py
new file mode 100644
index 0000000000..09f8f578fc
--- /dev/null
+++ b/backends/arm/test/passes/test_fuse_batchnorm_pass.py
@@ -0,0 +1,158 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import unittest
+
+import torch
+from executorch.backends.arm._passes.fuse_batchnorm2d_pass import FuseBatchnorm2DPass
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.arm_tester import ArmTester, RunPasses
+from parameterized import parameterized
+
+
+class MergeOneOfTwoBN(torch.nn.Module):
+ ops_before_pass = {
+ "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 2,
+ "executorch_exir_dialects_edge__ops_aten_convolution_default": 1,
+ }
+ ops_after_pass = {
+ "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 1,
+ "executorch_exir_dialects_edge__ops_aten_convolution_default": 1,
+ }
+
+ def __init__(self, affine: bool):
+ super().__init__()
+ self.conv2d = torch.nn.Conv2d(
+ in_channels=3, out_channels=3, kernel_size=3, stride=1, groups=1
+ )
+ self.batch_norm2d = torch.nn.BatchNorm2d(3, affine=affine)
+ self.batch_norm2d.running_mean = torch.rand(3)
+ self.batch_norm2d.running_var = torch.rand(3)
+ if affine:
+ self.batch_norm2d.weight = torch.nn.Parameter(torch.rand(3))
+ self.batch_norm2d.bias = torch.nn.Parameter(torch.rand(3))
+ self.relu6 = torch.nn.ReLU6()
+
+ def get_inputs(self) -> tuple[torch.Tensor]:
+ return (torch.randn(1, 3, 256, 256),)
+
+ def forward(self, x):
+ x = self.conv2d(x)
+ x = self.batch_norm2d(x)
+ x = self.relu6(x)
+ x = self.batch_norm2d(x)
+ return x
+
+
+class MergeTwosOfTwoBN(torch.nn.Module):
+ ops_before_pass = {
+ "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 2,
+ "executorch_exir_dialects_edge__ops_aten_convolution_default": 2,
+ }
+ ops_after_pass = {
+ "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 0,
+ "executorch_exir_dialects_edge__ops_aten_convolution_default": 2,
+ }
+
+ def __init__(self, affine: bool):
+ super().__init__()
+ self.conv2d = torch.nn.Conv2d(
+ in_channels=3, out_channels=3, kernel_size=3, stride=1, groups=1
+ )
+ self.conv2d2 = torch.nn.Conv2d(
+ in_channels=3, out_channels=3, kernel_size=3, stride=1, groups=1
+ )
+ self.batch_norm2d = torch.nn.BatchNorm2d(3, affine=affine)
+ self.batch_norm2d.running_mean = torch.rand(3)
+ self.batch_norm2d.running_var = torch.rand(3)
+ if affine:
+ self.batch_norm2d.weight = torch.nn.Parameter(torch.rand(3))
+ self.batch_norm2d.bias = torch.nn.Parameter(torch.rand(3))
+ self.relu6 = torch.nn.ReLU6()
+
+ def get_inputs(self) -> tuple[torch.Tensor]:
+ return (torch.randn(1, 3, 256, 256),)
+
+ def forward(self, x):
+ x = self.conv2d(x)
+ x = self.batch_norm2d(x)
+ x = self.relu6(x)
+ x = self.conv2d2(x)
+ x = self.batch_norm2d(x)
+ return x
+
+
+class MergeNoBN(torch.nn.Module):
+ ops_before_pass = {
+ "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 2,
+ "executorch_exir_dialects_edge__ops_aten_convolution_default": 3,
+ }
+ ops_after_pass = {
+ "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 2,
+ "executorch_exir_dialects_edge__ops_aten_convolution_default": 3,
+ }
+
+ def __init__(self, affine: bool):
+ super().__init__()
+ self.conv2d = torch.nn.Conv2d(
+ in_channels=3, out_channels=3, kernel_size=3, stride=1, groups=1
+ )
+ self.conv2d2 = torch.nn.Conv2d(
+ in_channels=3, out_channels=3, kernel_size=3, stride=1, groups=1
+ )
+ self.batch_norm2d = torch.nn.BatchNorm2d(3, affine=affine)
+ self.batch_norm2d.running_mean = torch.rand(3)
+ self.batch_norm2d.running_var = torch.rand(3)
+ if affine:
+ self.batch_norm2d.weight = torch.nn.Parameter(torch.rand(3))
+ self.batch_norm2d.bias = torch.nn.Parameter(torch.rand(3))
+ self.relu6 = torch.nn.ReLU6()
+
+ def get_inputs(self) -> tuple[torch.Tensor]:
+ return (torch.randn(1, 3, 256, 256),)
+
+ def forward(self, x):
+ x1 = self.conv2d(x)
+ x = self.batch_norm2d(x1) # Can't be fused since x1 has multiple users
+ x = self.relu6(x)
+ y = self.conv2d2(x1)
+ z = self.conv2d2(x)
+ a = self.batch_norm2d(
+ y
+ ) # Can't be fused since parameters of conv2d2 have multiple users.
+
+ return z, a
+
+
+modules = [
+ MergeOneOfTwoBN(True),
+ MergeOneOfTwoBN(False),
+ MergeTwosOfTwoBN(True),
+ MergeNoBN(True),
+]
+
+
+class TestFuseBatchnormPass(unittest.TestCase):
+
+ @parameterized.expand(modules)
+ def test_fuse_batchnorm_tosa_MI(self, module):
+ """Test various cases where the batchnorm should and shouldn't be fused."""
+ inputs = module.get_inputs()
+ test_pass_stage = RunPasses(passes_with_exported_program=[FuseBatchnorm2DPass])
+ (
+ (
+ ArmTester(
+ module,
+ example_inputs=inputs,
+ compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
+ )
+ .export()
+ .to_edge()
+ .check_count(module.ops_before_pass)
+ .run_passes(test_pass_stage)
+ .check_count(module.ops_after_pass)
+ .run_method_and_compare_outputs()
+ )
+ )
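Reviewer note: for readers new to the pass under test, the op-count assertions above rest on the standard conv+batchnorm folding identity. A numeric sketch of that identity (my own illustration, not the FuseBatchnorm2DPass implementation):

    import torch

    conv = torch.nn.Conv2d(3, 3, kernel_size=3, bias=True)
    bn = torch.nn.BatchNorm2d(3).eval()  # eval mode: uses running stats
    bn.running_mean = torch.rand(3)
    bn.running_var = torch.rand(3) + 0.1  # keep the variance away from zero

    with torch.no_grad():
        # Fold the batchnorm into the convolution's weight and bias.
        scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)
        fused = torch.nn.Conv2d(3, 3, kernel_size=3, bias=True)
        fused.weight.copy_(conv.weight * scale[:, None, None, None])
        fused.bias.copy_((conv.bias - bn.running_mean) * scale + bn.bias)

        x = torch.randn(1, 3, 16, 16)
        assert torch.allclose(bn(conv(x)), fused(x), atol=1e-5)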
diff --git a/backends/arm/test/passes/test_insert_table_ops_pass.py b/backends/arm/test/passes/test_insert_table_ops_pass.py
new file mode 100644
index 0000000000..c0a9235fa6
--- /dev/null
+++ b/backends/arm/test/passes/test_insert_table_ops_pass.py
@@ -0,0 +1,55 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
+ FoldAndAnnotateQParamsPass,
+)
+from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
+
+from executorch.backends.arm.test import common
+
+from executorch.backends.arm.test.tester.arm_tester import ArmTester, RunPasses
+
+
+class Sigmoid(torch.nn.Module):
+
+ def forward(self, x: torch.Tensor):
+ return x.sigmoid()
+
+ def get_inputs(self):
+ return (torch.rand(4),)
+
+
+class TestInsertTablePass(unittest.TestCase):
+
+ def test_insert_table_tosa_BI(self):
+ module = Sigmoid()
+ test_pass_stage = RunPasses(
+ [FoldAndAnnotateQParamsPass],
+ passes_with_exported_program=[InsertTableOpsPass],
+ )
+ (
+ ArmTester(
+ module,
+ example_inputs=module.get_inputs(),
+ compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
+ )
+ .quantize()
+ .export()
+ .to_edge()
+ .run_passes(test_pass_stage)
+ .check("tosa._table")
+ .check_count(
+ {
+ "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 1,
+ "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 1,
+ }
+ )
+ .check_not(["aten_sigmoid_default"])
+ )
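Reviewer note: the check("tosa._table") above refers to the lookup-table lowering of quantized unary ops. A rough numeric illustration of the idea (the scales and zero-points are made-up values, not what InsertTableOpsPass actually emits):

    import torch

    in_scale, in_zp = 0.05, 0  # assumed input quantization parameters
    out_scale, out_zp = 1.0 / 256, -128  # assumed output quantization parameters

    # Build a 256-entry table: dequantize every possible int8 input value,
    # apply sigmoid, then requantize the result.
    qin = torch.arange(-128, 128, dtype=torch.float32)
    table = torch.clamp(
        torch.round(torch.sigmoid((qin - in_zp) * in_scale) / out_scale) + out_zp,
        -128,
        127,
    ).to(torch.int8)

    def quantized_sigmoid(q: torch.Tensor) -> torch.Tensor:
        # Index by the int8 value shifted into [0, 255].
        return table[q.to(torch.int64) + 128]

    q = torch.randint(-128, 128, (4,), dtype=torch.int8)
    print(quantized_sigmoid(q))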
diff --git a/backends/arm/test/passes/test_ioquantization_pass.py b/backends/arm/test/passes/test_ioquantization_pass.py
new file mode 100644
index 0000000000..e31007f1ed
--- /dev/null
+++ b/backends/arm/test/passes/test_ioquantization_pass.py
@@ -0,0 +1,70 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+
+from executorch.backends.arm.test import common
+
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.exir.passes.quantize_io_pass import QuantizeInputs, QuantizeOutputs
+
+
+class SimpleModel(torch.nn.Module):
+ def forward(self, x, y):
+ return x + y
+
+ def get_inputs(self):
+ a = torch.rand(1, 2, 2, 1)
+ b = torch.rand(1, 2, 2, 1)
+ return (a, b)
+
+
+class TestIOQuantizationPass(unittest.TestCase):
+ """
+ Test that the executorch/exir/passes/quantize_io_pass pass works (meaning we don't get Q/DQ nodes) on a simple model.
+ """
+
+ def test_ioquantisation_pass(self):
+ model = SimpleModel()
+ tester = (
+ ArmTester(
+ model,
+ example_inputs=model.get_inputs(),
+ compile_spec=common.get_u55_compile_spec(),
+ )
+ .quantize()
+ .export()
+ .to_edge()
+ .check_count(
+ {
+ "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3
+ }
+ )
+ .check_count(
+ {
+ "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3
+ }
+ )
+ .partition()
+ .check_count(
+ {
+ "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2
+ }
+ )
+ .check_count(
+ {
+ "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 1
+ }
+ )
+ )
+ edge = tester.get_artifact()
+ edge.transform(
+ passes=[QuantizeInputs(edge, [0, 1]), QuantizeOutputs(edge, [0])]
+ )
+ tester.check_not(["edge__ops_quantized_decomposed_quantize_per_tensor"])
+ tester.check_not(["edge__ops_quantized_decomposed_dequantize_per_tensor"])
diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py
index 6996d53e91..577e114be0 100644
--- a/backends/arm/test/runner_utils.py
+++ b/backends/arm/test/runner_utils.py
@@ -65,16 +65,7 @@ def get_input_names(program: ExportedProgram) -> list[str]:
Returns:
A list of strings with the names of the model input.
"""
- input_names = []
-
- # E.g. bias and weights are 'placeholders' as well. This is used to
- # get only the use inputs.
- usr_inputs = program.graph_signature.user_inputs
- for node in program.graph.nodes:
- if node.op == "placeholder" and node.name in usr_inputs:
- input_names.append(node.name)
-
- return input_names
+ return [spec.arg.name for spec in program.graph_signature.input_specs]
def get_input_quantization_params(
@@ -178,7 +169,7 @@ def _tosa_dispatch(self, lowered_backend_module: LoweredBackendModule, inputs):
return run_tosa_graph(tosa_buffer, tosa_version, inputs)
def __torch_function__(self, func, types, args=..., kwargs=None):
- if isinstance(func, torch._higher_order_ops.executorch_call_delegate.ExecutorchCallDelegate): # type: ignore
+ if func is torch._higher_order_ops.executorch_call_delegate:
lowered_backend_module = cast(LoweredBackendModule, args[0])
if lowered_backend_module.backend_id == "ArmBackend":
return self._tosa_dispatch(lowered_backend_module, args[1:])
@@ -334,13 +325,16 @@ def run_corstone(
def prep_data_for_save(
- data: torch.Tensor,
+ data,
input_name: str,
quant_param: Optional[QuantizationParams] = None,
):
- data_np = np.array(data.detach(), order="C").astype(
- torch_to_numpy_dtype_dict[data.dtype]
- )
+ if isinstance(data, torch.Tensor):
+ data_np = np.array(data.detach(), order="C").astype(
+ torch_to_numpy_dtype_dict[data.dtype]
+ )
+ else:
+ data_np = np.array(data)
if quant_param is not None:
assert quant_param.node_name in input_name, (
f"The quantization params name '{quant_param.node_name}' does not "
@@ -492,6 +486,47 @@ def _tosa_refmodel_loglevel(loglevel: int) -> str:
return loglevel_map[clamped_logging_level]
+def corstone300_installed() -> bool:
+ cmd = ["FVP_Corstone_SSE-300_Ethos-U55", "--version"]
+ try:
+ _run_cmd(cmd, check=True)
+ except Exception:
+ return False
+ return True
+
+
+def corstone320_installed() -> bool:
+ cmd = ["FVP_Corstone_SSE-320", "--version"]
+ try:
+ _run_cmd(cmd, check=True)
+ except Exception:
+ return False
+ return True
+
+
+def get_elf_path(target_board):
+ elf_path = os.path.join(
+ "cmake-out",
+ f"arm_semihosting_executor_runner_{target_board}",
+ "arm_executor_runner",
+ )
+ if not os.path.exists(elf_path):
+ raise RuntimeError(
+ f"Did not find build arm_executor_runner in path {elf_path}, run setup_testing.sh?"
+ )
+ else:
+ return elf_path
+
+
+def arm_executor_runner_exists(target_board):
+ try:
+ get_elf_path(target_board)
+ except Exception:
+ return False
+ else:
+ return True
+
+
def run_tosa_graph(
graph: TosaGraph,
tosa_version: TosaSpecification,
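Reviewer note: the new corstone*_installed and arm_executor_runner_exists helpers look intended for gating FVP tests. A hypothetical skip-marker sketch of how they could be wired up (the "corstone-300" target-board string and the marker name are my assumptions, not part of this patch):

    import pytest

    from executorch.backends.arm.test import runner_utils

    requires_corstone300 = pytest.mark.skipif(
        not (
            runner_utils.corstone300_installed()
            and runner_utils.arm_executor_runner_exists("corstone-300")
        ),
        reason="Corstone-300 FVP or arm_executor_runner build not available",
    )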
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 639cea5bae..11e7d86304 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -8,11 +8,11 @@
import os
from collections import Counter
from pprint import pformat
-from typing import Iterable, List, Optional, Tuple, Union
+from typing import Callable, Iterable, List, Optional, Tuple, Type, Union
import executorch.backends.xnnpack.test.tester.tester as tester
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore[import-untyped]
import torch.fx
import torch.utils._pytree as pytree
@@ -25,6 +25,7 @@
)
from executorch.backends.arm.test.runner_utils import (
dbg_tosa_fb_to_json,
+ get_elf_path,
get_output_nodes,
get_output_quantization_params,
get_target_board,
@@ -41,10 +42,18 @@
from executorch.backends.xnnpack.test.tester import Tester
from executorch.devtools.backend_debug import get_delegation_info
-from executorch.exir import EdgeCompileConfig, ExecutorchProgramManager
+from executorch.exir import (
+ EdgeCompileConfig,
+ EdgeProgramManager,
+ ExecutorchProgramManager,
+ ExportedProgram,
+)
+from executorch.exir.backend.backend_api import validation_disabled
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.backend.partitioner import Partitioner
from executorch.exir.lowered_backend_module import LoweredBackendModule
+from executorch.exir.pass_base import ExportPass
+from executorch.exir.program._program import _update_exported_program_graph_module
from tabulate import tabulate
from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec
@@ -132,11 +141,8 @@ def run_artifact(self, inputs):
inputs_flattened, _ = tree_flatten(inputs)
intermediate_path = get_intermediate_path(self.compile_spec)
target_board = get_target_board(self.compile_spec)
- elf_path = os.path.join(
- "cmake-out",
- f"arm_semihosting_executor_runner_{target_board}",
- "arm_executor_runner",
- )
+ elf_path = get_elf_path(target_board)
+
if not os.path.exists(elf_path):
raise FileNotFoundError(
f"Did not find build arm_executor_runner in path {elf_path}, run setup_testing.sh?"
@@ -158,6 +164,44 @@ def run_artifact(self, inputs):
return super().run_artifact(inputs)
+class RunPasses(tester.RunPasses):
+
+ def __init__(
+ self,
+ pass_list: Optional[List[Type[ExportPass]]] = None,
+ pass_functions: Optional[List[Callable]] = None,
+ passes_with_exported_program: Optional[List[Type[ExportPass]]] = None,
+ ):
+ """Passes are run in the order they are passed: first pass_list, second pass_functions,
+ and lastly passes_with_exported_program."""
+ self.pass_with_exported_program = passes_with_exported_program
+ super().__init__(pass_list, pass_functions)
+
+ def run(
+ self, artifact: Union[EdgeProgramManager, ExportedProgram], inputs=None
+ ) -> None:
+ if self.pass_with_exported_program is not None:
+ self.pass_functions = self.pass_functions or [] # type: ignore
+
+ # pass_function list from superclass expects functions that take in
+ # and return ExportedPrograms.
+ # Create a wrapper to fit pass_with_exported_program into this.
+ def wrap_ep_pass(ep_pass: Type[ExportPass]):
+ def wrapped_ep_pass(ep: ExportedProgram) -> ExportedProgram:
+ pass_result = ep_pass(ep).call(ep.graph_module)
+ with validation_disabled():
+ return _update_exported_program_graph_module(
+ ep, pass_result.graph_module
+ )
+
+ return wrapped_ep_pass
+
+ self.pass_functions.extend(
+ [wrap_ep_pass(ep_pass) for ep_pass in self.pass_with_exported_program]
+ )
+ super().run(artifact, inputs)
+
+
class InitialModel(tester.Stage):
def __init__(self, model: torch.nn.Module):
self.model = model
diff --git a/backends/arm/test/tester/test_pipeline.py b/backends/arm/test/tester/test_pipeline.py
new file mode 100644
index 0000000000..bc67783ddb
--- /dev/null
+++ b/backends/arm/test/tester/test_pipeline.py
@@ -0,0 +1,369 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Any, Callable, Generic, List, TypeVar
+
+import torch
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+
+
+logger = logging.getLogger(__name__)
+T = TypeVar("T")
+""" Generic type used for test data in the pipeline. Depends on which type the operator expects."""
+
+
+class BasePipelineMaker(Generic[T]):
+ """
+ The BasePipelineMaker defines a list of stages to be applied to a torch.nn.Module for lowering it in the Arm backend. It is meant to be inherited and adjusted for particular targets.
+ Importantly, the pipeline list can be modified before running the pipeline to support various pipeline extensions and debugging use cases.
+
+ Attributes:
+ module: The module which the pipeline is applied to.
+ test_data: Data used for quantizing and testing the module.
+ aten_ops: Aten dialect ops expected to be found in the graph after export.
+ exir_ops: Exir dialect ops expected to be found in the graph after to_edge.
+ compile_spec: The compile spec used in the lowering process.
+ use_to_edge_transform_and_lower: Selects between two possible routes for lowering the module:
+ tester.to_edge_transform_and_lower()
+ or
+ tester.to_edge().check(exir_ops).partition()
+ """
+
+ class PipelineStage:
+ """
+ Helper class to store a pipeline stage as a function call + args for calling later on.
+
+ Attributes:
+ id: name of the function to be called, used for referring to stages in the pipeline
+ func: handle to the function to be called
+ args: args used when called
+ kwargs: kwargs used when called
+ is_called: keeps track of whether the function has been called
+ """
+
+ def __init__(self, func, *args, **kwargs):
+ self.id: str = func.__name__
+ self.func: Callable = func
+ self.args = args
+ self.kwargs = kwargs
+ self.is_called = False
+
+ def __call__(self):
+ if not self.is_called:
+ self.func(*self.args, **self.kwargs)
+ else:
+ raise RuntimeError(f"{self.id} called twice.")
+ self.is_called = True
+
+ def update(self, *args, **kwargs):
+ if not self.is_called:
+ self.args = args
+ self.kwargs = kwargs
+ else:
+ raise RuntimeError(f"{self.id} args updated after being called.")
+
+ def __init__(
+ self,
+ module: torch.nn.Module,
+ test_data: T,
+ aten_ops: str | List[str],
+ exir_ops: str | List[str],
+ compile_spec: List[CompileSpec],
+ use_to_edge_transform_and_lower: bool = False,
+ ):
+
+ self.tester = ArmTester(
+ module, example_inputs=test_data, compile_spec=compile_spec
+ )
+
+ self.aten_ops = aten_ops if isinstance(aten_ops, list) else [aten_ops]
+ self.exir_ops = exir_ops if isinstance(exir_ops, list) else [exir_ops]
+ self.test_data = test_data
+ self._stages = []
+
+ self.add_stage(-1, self.tester.export)
+ self.add_stage(-1, self.tester.check, self.aten_ops)
+ if use_to_edge_transform_and_lower:
+ self.add_stage(-1, self.tester.to_edge_transform_and_lower)
+
+ else:
+ self.add_stage(-1, self.tester.to_edge)
+ self.add_stage(-1, self.tester.check, self.exir_ops)
+ self.add_stage(-1, self.tester.partition)
+ self.add_stage(-1, self.tester.check_not, self.exir_ops)
+ self.add_stage(
+ -1,
+ self.tester.check_count,
+ {"torch.ops.higher_order.executorch_call_delegate": 1},
+ )
+ self.add_stage(-1, self.tester.to_executorch)
+
+ def add_stage(self, pos: int, func: Callable, *args, **kwargs):
+ """Adds a stage defined by a function with arguments to the pipeline at index pos. Pos wraps around the list for negative values."""
+ pipeline_stage = self.PipelineStage(func, *args, **kwargs)
+ pipeline_length = len(self._stages)
+
+ if pos < 0:
+ pos = pipeline_length + (pos + 1)
+
+ if not -pipeline_length <= pos <= pipeline_length:
+ raise ValueError(
+ f"Pos must be between [-{pipeline_length}, {pipeline_length}]"
+ )
+
+ self._stages.insert(pos, pipeline_stage)
+
+ logger.debug(f"Added stage {func.__name__} to {type(self).__name__}")
+
+ return self
+
+ def pop_stage(self, pos: int):
+ """Removes and returns the stage at postion pos"""
+ return self._stages.pop(pos)
+
+ def find_pos(self, stage_id: str):
+ """Returns the position of the stage id. Note that this only finds the first stage with the given id, i.e. it should only be used with unique stages."""
+ for i, stage in enumerate(self._stages):
+ if stage.id == stage_id:
+ return i
+
+ raise Exception(f"Stage id {stage_id} not found in pipeline")
+
+ def add_stage_after(self, stage_id: str, func: Callable, *args, **kwargs):
+ """Adds a stage after the given stage id. Note that this only finds the first stage with the given id, i.e. it should only be used with unique stages."""
+ pos = self.find_pos(stage_id)
+ self.add_stage(pos + 1, func, *args, **kwargs)
+ return self
+
+ def dump_artifact(self, stage_id: str):
+ """Adds a dump_artifact stage after the given stage id. Note that this only finds the first stage with the given id, i.e. it should only be used with unique stages."""
+ self.add_stage_after(stage_id, self.tester.dump_artifact)
+ return self
+
+ def dump_operator_distribution(self, stage_id: str):
+ """Adds a dump_operator_distribution stage after the given stage id. Note that this only finds the first stage with the given id, i.e. it should only be used with unique stages."""
+ self.add_stage_after(stage_id, self.tester.dump_operator_distribution)
+ return self
+
+ def change_args(self, stage_id: str, *args, **kwargs):
+ """Updates the args to the given stage id. Note that this only finds the first stage with the given id, i.e. it should only be used with unique stages."""
+ pos = self.find_pos(stage_id)
+ pipeline_stage = self._stages[pos]
+ pipeline_stage.update(*args, **kwargs)
+ return self
+
+ def run(self):
+ """Calls each stage in order."""
+ stage_list = [stage.id for stage in self._stages]
+ logger.info(f"Running pipeline with stages:\n {stage_list}.")
+
+ for stage in self._stages:
+ try:
+ stage()
+ except Exception as e:
+ logger.error(f"\nFailure in stage <{stage.id}>: \n {str(e)}")
+ raise e
+
+
+class TosaPipelineBI(BasePipelineMaker, Generic[T]):
+ """Lowers a graph to BI TOSA spec (with quantization) and tests it with the TOSA reference model."""
+
+ def __init__(
+ self,
+ module: torch.nn.Module,
+ test_data: Any,
+ aten_op: str,
+ exir_op: str,
+ tosa_version: str = "TOSA-0.80+BI",
+ use_to_edge_transform_and_lower: bool = False,
+ ):
+ compile_spec = common.get_tosa_compile_spec(
+ tosa_version,
+ )
+ super().__init__(
+ module,
+ test_data,
+ aten_op,
+ exir_op,
+ compile_spec,
+ use_to_edge_transform_and_lower,
+ )
+ self.add_stage(0, self.tester.quantize)
+ self.add_stage_after(
+ "quantize",
+ self.tester.check,
+ [
+ "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
+ "torch.ops.quantized_decomposed.quantize_per_tensor.default",
+ ],
+ )
+
+ remove_quant_nodes_stage = (
+ "to_edge_transform_and_lower"
+ if use_to_edge_transform_and_lower
+ else "partition"
+ )
+ self.add_stage_after(
+ remove_quant_nodes_stage,
+ self.tester.check_not,
+ [
+ "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
+ "torch.ops.quantized_decomposed.quantize_per_tensor.default",
+ ],
+ )
+
+ self.add_stage(
+ -1, self.tester.run_method_and_compare_outputs, inputs=self.test_data
+ )
+
+
+class TosaPipelineMI(BasePipelineMaker, Generic[T]):
+ """Lowers a graph to MI TOSA spec and tests it with the TOSA reference model"""
+
+ def __init__(
+ self,
+ module: torch.nn.Module,
+ test_data: Any,
+ aten_op: str,
+ exir_op: str,
+ tosa_version: str = "TOSA-0.80+MI",
+ use_to_edge_transform_and_lower: bool = False,
+ ):
+ compile_spec = common.get_tosa_compile_spec(
+ tosa_version,
+ )
+ super().__init__(
+ module,
+ test_data,
+ aten_op,
+ exir_op,
+ compile_spec,
+ use_to_edge_transform_and_lower,
+ )
+ self.add_stage_after(
+ "export",
+ self.tester.check_not,
+ [
+ "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
+ "torch.ops.quantized_decomposed.quantize_per_tensor.default",
+ ],
+ )
+
+ self.add_stage(
+ -1, self.tester.run_method_and_compare_outputs, inputs=self.test_data
+ )
+
+
+class EthosU55PipelineBI(BasePipelineMaker, Generic[T]):
+ """Lowers a graph to u55 BI TOSA spec and tests it on the Corstone300 FVP, if run_on_fvp is true."""
+
+ def __init__(
+ self,
+ module: torch.nn.Module,
+ test_data: T,
+ aten_ops: str | List[str],
+ exir_ops: str | List[str],
+ run_on_fvp: bool = False,
+ use_to_edge_transform_and_lower: bool = False,
+ ):
+ compile_spec = common.get_u55_compile_spec()
+ super().__init__(
+ module,
+ test_data,
+ aten_ops,
+ exir_ops,
+ compile_spec,
+ use_to_edge_transform_and_lower,
+ )
+ self.add_stage(0, self.tester.quantize)
+ self.add_stage_after(
+ "quantize",
+ self.tester.check,
+ [
+ "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
+ "torch.ops.quantized_decomposed.quantize_per_tensor.default",
+ ],
+ )
+
+ remove_quant_nodes_stage = (
+ "to_edge_transform_and_lower"
+ if use_to_edge_transform_and_lower
+ else "partition"
+ )
+ self.add_stage_after(
+ remove_quant_nodes_stage,
+ self.tester.check_not,
+ [
+ "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
+ "torch.ops.quantized_decomposed.quantize_per_tensor.default",
+ ],
+ )
+
+ if run_on_fvp:
+ self.add_stage(-1, self.tester.serialize)
+ self.add_stage(
+ -1,
+ self.tester.run_method_and_compare_outputs,
+ qtol=1,
+ inputs=self.test_data,
+ )
+
+
+class EthosU85PipelineBI(BasePipelineMaker, Generic[T]):
+ """Lowers a graph to u85 BI TOSA spec and tests it on the Corstone320 FVP, if run_on_fvp is true."""
+
+ def __init__(
+ self,
+ module: torch.nn.Module,
+ test_data: T,
+ aten_ops: str | List[str],
+ exir_ops: str | List[str],
+ run_on_fvp: bool = False,
+ use_to_edge_transform_and_lower: bool = False,
+ ):
+ compile_spec = common.get_u85_compile_spec()
+ super().__init__(
+ module,
+ test_data,
+ aten_ops,
+ exir_ops,
+ compile_spec,
+ use_to_edge_transform_and_lower,
+ )
+ self.add_stage(0, self.tester.quantize)
+ self.add_stage_after(
+ "quantize",
+ self.tester.check,
+ [
+ "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
+ "torch.ops.quantized_decomposed.quantize_per_tensor.default",
+ ],
+ )
+
+ remove_quant_nodes_stage = (
+ "to_edge_transform_and_lower"
+ if use_to_edge_transform_and_lower
+ else "partition"
+ )
+ self.add_stage_after(
+ remove_quant_nodes_stage,
+ self.tester.check_not,
+ [
+ "torch.ops.quantized_decomposed.dequantize_per_tensor.default",
+ "torch.ops.quantized_decomposed.quantize_per_tensor.default",
+ ],
+ )
+
+ if run_on_fvp:
+ self.add_stage(-1, self.tester.serialize)
+ self.add_stage(
+ -1,
+ self.tester.run_method_and_compare_outputs,
+ qtol=1,
+ inputs=self.test_data,
+ )
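Reviewer note: a minimal usage sketch of the new pipeline classes, to show how a future op test could be written against them. The Add module and the op strings below are illustrative assumptions, not taken from this patch:

    import torch

    from executorch.backends.arm.test.tester.test_pipeline import TosaPipelineMI

    class Add(torch.nn.Module):
        def forward(self, x, y):
            return x + y

    test_data = (torch.rand(4), torch.rand(4))
    pipeline = TosaPipelineMI(
        Add(),
        test_data,
        aten_op="torch.ops.aten.add.Tensor",
        exir_op="executorch_exir_dialects_edge__ops_aten_add_Tensor",
    )
    pipeline.dump_artifact("export")  # inspect the graph right after export
    pipeline.pop_stage(-1)  # e.g. drop the final reference-model run
    pipeline.run()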
diff --git a/backends/arm/tosa_mapping.py b/backends/arm/tosa_mapping.py
index ec57bd5ce2..75d82f2a4b 100644
--- a/backends/arm/tosa_mapping.py
+++ b/backends/arm/tosa_mapping.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -11,7 +11,7 @@
# the standardised TOSA representation.
#
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py
index 9869a08c0b..d53362cb36 100644
--- a/backends/arm/tosa_quant_utils.py
+++ b/backends/arm/tosa_quant_utils.py
@@ -10,9 +10,9 @@
import math
from typing import cast, NamedTuple
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch.fx
-import tosa.Op as TosaOp
+import tosa.Op as TosaOp # type: ignore
from executorch.backends.arm.tosa_mapping import TosaArg
from executorch.exir.dialects._ops import ops as exir_ops
from serializer.tosa_serializer import TosaSerializerTensor
diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py
index 9fefdbb3ff..15d29b5748 100644
--- a/backends/arm/tosa_utils.py
+++ b/backends/arm/tosa_utils.py
@@ -9,8 +9,7 @@
import os
from typing import Any
-import numpy as np
-import serializer.tosa_serializer as ts
+import serializer.tosa_serializer as ts # type: ignore
import torch
from executorch.backends.arm.tosa_mapping import TosaArg
@@ -72,45 +71,6 @@ def dbg_fail(node, tosa_graph, path):
raise RuntimeError("TOSA Internal Error on node, enable logging for further info.")
-# Helper function to match TOSA's broadcasting rank requirement
-# Ref: TOSA 0.80 specification - 1.9.3. Data Layouts from
-# https://www.mlplatform.org/tosa/tosa_spec.html
-def promote_shape(tosa_fb, arg, promoted_shape, out_dtype):
- assert np.prod(arg.shape) == np.prod(promoted_shape), "Incompatible promoted shape"
- reshape_res = tosa_fb.addIntermediate(promoted_shape, out_dtype)
- attr = ts.TosaSerializerAttribute()
- attr.ReshapeAttribute(promoted_shape)
- tosa_fb.addOperator(TosaOp.Op().RESHAPE, [arg.name], [reshape_res.name], attr)
- return reshape_res
-
-
-# Helper transpose function to match TOSA's shape requirements
-# E.g., TOSA 0.80 specification - 2.3.3 CONV2D shapes:
-# https://www.mlplatform.org/tosa/tosa_spec.html#_conv2d
-def transpose_helper(tosa_fb, input, new_order, out_dtype):
- # Check new_order's length is equal to input rank
- assert len(input.shape) == len(new_order), "Wrong shape order length"
-
- # Check no duplications
- assert len(set(new_order)) == len(new_order), "Contain duplicated dim numbers"
-
- # Check all dims are valid
- for idx in new_order:
- if idx < 0:
- assert True, "Negative dim number"
- elif idx >= len(input.shape):
- assert True, "Dim is greater than input rank"
-
- input_shape_transpoed = [input.shape[i] for i in new_order]
- attr = ts.TosaSerializerAttribute()
- attr.TransposeAttribute(new_order)
- input_transposed = tosa_fb.addIntermediate(input_shape_transpoed, out_dtype)
- tosa_fb.addOperator(
- TosaOp.Op().TRANSPOSE, [input.name], [input_transposed.name], attr
- )
- return input_transposed
-
-
def getNodeArgs(node: Node) -> list[TosaArg]:
return [TosaArg(arg) for arg in node.args]
diff --git a/backends/arm/util/arm_model_evaluator.py b/backends/arm/util/arm_model_evaluator.py
index f8aeab25ba..e13f9c4df0 100644
--- a/backends/arm/util/arm_model_evaluator.py
+++ b/backends/arm/util/arm_model_evaluator.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -17,7 +17,7 @@
import torch
from torch.nn.modules import Module
from torch.utils.data import DataLoader
-from torchvision import datasets, transforms
+from torchvision import datasets, transforms # type: ignore[import-untyped]
# Logger for outputting progress for longer running evaluation
@@ -59,7 +59,7 @@ def __init__(
if tosa_output_path:
self.tosa_output_path = tosa_output_path
else:
- self.tosa_output_path = None
+ self.tosa_output_path = ""
def get_model_error(self) -> defaultdict:
"""
@@ -104,7 +104,7 @@ def get_compression_ratio(self) -> float:
return compression_ratio
- def evaluate(self) -> dict[Any]:
+ def evaluate(self) -> dict[str, Any]:
model_error_dict = self.get_model_error()
output_metrics = {"name": self.model_name, "metrics": dict(model_error_dict)}
@@ -112,7 +112,7 @@ def evaluate(self) -> dict[Any]:
if self.tosa_output_path:
# We know output_metrics["metrics"] is list since we just defined it, safe to ignore.
# pyre-ignore[16]
- output_metrics["metrics"][
+ output_metrics["metrics"][ # type: ignore[index]
"compression_ratio"
] = self.get_compression_ratio()
diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
index bf4a274134..f9abe1c542 100644
--- a/backends/cadence/aot/compiler.py
+++ b/backends/cadence/aot/compiler.py
@@ -33,7 +33,6 @@
ExecutorchProgramManager,
to_edge,
)
-from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_base import PassResult
from executorch.exir.passes import ToOutVarPass
from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
@@ -57,6 +56,7 @@ def convert_pt2(
model: torch.nn.Module,
inputs: tuple[object, ...],
quantizer: CadenceQuantizer,
+ dump_graphs: bool = False,
) -> torch.fx.GraphModule:
"""
Prepare and convert a model using the given quantizer.
@@ -87,6 +87,10 @@ def convert_pt2(
.module()
)
+ if dump_graphs:
+ logging.info("Graph before quantization:")
+ logging.info(model_gm.graph.print_tabular())
+
# Prepare
prepared_model = prepare_pt2e(model_gm, quantizer)
@@ -96,6 +100,10 @@ def convert_pt2(
# Convert
converted_model = convert_pt2e(prepared_model)
+ if dump_graphs:
+ logging.info("Graph after quantization (before fusion):")
+ logging.info(converted_model.graph.print_tabular())
+
return converted_model
@@ -128,6 +136,7 @@ def quantize_pt2(
model: torch.nn.Module,
inputs: tuple[object, ...],
quantizer: Optional[CadenceQuantizer] = None,
+ dump_graphs: bool = False,
) -> torch.fx.GraphModule:
"""
Prepare, convert and fuse the model using the given quantizer.
@@ -141,11 +150,15 @@ def quantize_pt2(
quantizer = CadenceDefaultQuantizer()
# Get converted graph module
- converted_gm = convert_pt2(model, inputs, quantizer)
+ converted_gm = convert_pt2(model, inputs, quantizer, dump_graphs)
# Get fused model
fused_gm = fuse_pt2(converted_gm, quantizer)
+ if dump_graphs:
+ logging.info("Graph after quantization and fusion:")
+ logging.info(fused_gm.graph.print_tabular())
+
return fused_gm
@@ -153,7 +166,6 @@ def quantize_pt2(
def export_program(
model: torch.nn.Module,
inputs: tuple[object, ...],
- dump_graphs: bool = False,
) -> ExportedProgram:
assert isinstance(model, torch.nn.Module), "model should be an nn.Module"
@@ -163,10 +175,6 @@ def export_program(
# Export the model and return it.
expo_program = export(model, inputs, strict=True)
- if dump_graphs:
- logging.info("Exported graph:")
- expo_program.graph_module.graph.print_tabular()
-
return expo_program
@@ -180,13 +188,14 @@ def export_to_edge(
assert isinstance(model, torch.nn.Module), "model should be an nn.Module"
# Export the model into an ExportedProgram.
- expo_program = export_program(model, inputs, dump_graphs=dump_graphs)
+ expo_program = export_program(model, inputs)
# Call to_edge to convert the graph to edge IR.
# Note: dim_order is skipped (https://github.com/pytorch/executorch/issues/3704)
edge_prog_manager = to_edge(
expo_program,
compile_config=EdgeCompileConfig(
+ _skip_dim_order=True,
# Allow specific non-core aten ops in the IR.
_core_aten_ops_exception_list=[
torch.ops.aten._native_batch_norm_legit_functional.default,
@@ -194,18 +203,16 @@ def export_to_edge(
torch.ops.aten.linalg_vector_norm.default,
torch.ops.aten.unfold.default,
torch.ops.aten.angle.default,
- # cadence replaced to_dim_order_copy with _to_copy for performance
- # skip _to_copy op to get around of dim order check
- # We should remove this op once cadence can support dim order
- exir_ops.edge.aten._to_copy.default,
],
),
constant_methods=constant_methods,
)
if dump_graphs:
- logging.info("Edge graph:")
- edge_prog_manager.exported_program().graph_module.graph.print_tabular()
+ logging.info("Graph after Edge lowering:")
+ logging.info(
+ edge_prog_manager.exported_program().graph_module.graph.print_tabular()
+ )
return edge_prog_manager
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
index cc304a226a..89ef821c56 100644
--- a/backends/cadence/aot/replace_ops.py
+++ b/backends/cadence/aot/replace_ops.py
@@ -11,7 +11,6 @@
# pyre-unsafe
-import copy
import math
from operator import neg
from typing import cast, Dict, Iterable, Sequence, Set, Tuple
@@ -36,12 +35,7 @@
from executorch.backends.cadence.aot.utils import get_edge_overload_packet
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
-from executorch.exir.dim_order_utils import get_memory_format
from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
-from executorch.exir.passes.dim_order_ops_registry import (
- DimOrderOpsMap,
- MemoryFormatOpsMap,
-)
from torch._subclasses import FakeTensor
from torch.fx.node import Argument
@@ -1805,72 +1799,6 @@ def call_operator(
)
-@register_cadence_pass(CadencePassAttribute(opt_level=0))
-class ReplaceToDimOrderCopyWithToCopyPass(ExportPass):
- """
- dim_order_ops::to_dim_order_copy is not supported, so this is an opt_level=0 pass.
- If the dim order is sequential, we don't need the extra work with strides and
- can just use to_copy.
- """
-
- def call_operator(
- self,
- op,
- args: Tuple[Argument, ...],
- kwargs: Dict[str, Argument],
- meta: NodeMetadata,
- ) -> ProxyValue:
- if op not in DimOrderOpsMap:
- return super().call_operator(op, args, kwargs, meta)
-
- # new kwargs with dim_order, and no memory_format for the new op
- nkwargs = dict(copy.deepcopy(kwargs)) # orig kwargs are immutable
-
- ndim = None
-
- # can always get the shape, assuming rank is specialized
-
- # pyre-ignore[16]: `None` has no attribute `to_tensor`
- if isinstance(args[0], ProxyValue) and args[0].is_tensor():
- # pyre-ignore[16]: `None` has no attribute `to_tensor`
- ndim = args[0].to_tensor().dim()
- elif isinstance(args[0], torch.Tensor):
- # pyre-ignore[16]: `None` has no attribute `dim`
- ndim = args[0].dim()
- elif isinstance(args[0], torch.fx.immutable_collections.immutable_list):
- # pyre-ignore[6]: Incompatible parameter type
- ndim = len(args[0])
- else:
- assert 0, f"Expecting a Tensor or a ProxyValue but got {type(args[0])}"
-
- # get the "to" memory format for the EdgeOp
- contiguous_dim_order = list(range(ndim))
- dim_order = nkwargs.pop("dim_order", None)
-
- # Cadence only supports contiguous memory format
- assert (
- dim_order is None
- # pyre-ignore[6]: Incompatible parameter type
- or len(dim_order) == 0
- or dim_order == contiguous_dim_order
- ), "Expected dim order in congituous or prevserve memory format, but got {}".format(
- dim_order
- )
-
- # bring back memory format
- # pyre-ignore[6]: Incompatible parameter type
- nkwargs["memory_format"] = get_memory_format(dim_order)
-
- memory_format_op = MemoryFormatOpsMap[op]
-
- return super().call_operator(
- memory_format_op,
- args,
- nkwargs,
- meta,
- )
-
-
@register_cadence_pass(CadencePassAttribute(opt_level=0))
class ReplaceFullLikeWithFullPass(ExportPass):
"""
@@ -2180,5 +2108,4 @@ class CadenceReplaceOpsInGraph:
ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
- ReplaceToDimOrderCopyWithToCopyPass,
]
diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp
index d51fee5338..409c4cc510 100644
--- a/backends/cadence/fusion_g3/operators/op_add.cpp
+++ b/backends/cadence/fusion_g3/operators/op_add.cpp
@@ -35,21 +35,7 @@ Tensor& add_out(
const Tensor& b,
const Scalar& alpha,
Tensor& out) {
- // Common Dtype
- ScalarType common_type =
- executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());
-
#ifdef OP_ARG_CHECK
- // Check Common Dtype
- ET_KERNEL_CHECK(
- ctx,
- (canCast(common_type, out.scalar_type()) &&
- torch::executor::check_alpha_type(
- torch::executor::native::utils::get_scalar_dtype(alpha),
- common_type)),
- InvalidArgument,
- out);
-
// Check Dim Order
ET_KERNEL_CHECK(
ctx,
@@ -65,10 +51,6 @@ Tensor& add_out(
out);
#endif
- // Compute Dtype
- ScalarType compute_type =
- torch::executor::native::utils::get_compute_type(common_type);
-
static constexpr const char op_name[] = "add.out";
int kTensorDimensionLimit = 5;
@@ -77,12 +59,12 @@ Tensor& add_out(
int inp2_shape[kTensorDimensionLimit];
int out_shape[kTensorDimensionLimit];
- bool broadcast = 0;
+ bool broadcast = false;
int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
max_dim = out.dim() > max_dim ? out.dim() : max_dim;
- bool optimized = 1;
+ bool optimized = true;
/* Added change to work with input dimensions more than 5 */
for (int i = 0; i < max_dim; i++) {
@@ -109,15 +91,19 @@ Tensor& add_out(
for (int i = 0; i < out.dim(); i++) {
if (((inp1_shape[i]) != (out_shape[i])) ||
((inp2_shape[i]) != (out_shape[i]))) {
- broadcast = 1;
+ broadcast = true;
}
}
- if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
- optimized = 0;
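+ // Use the fast path only when broadcasting fits within kTensorDimensionLimit
+ // and a, b, and out all share an Int or Float dtype.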
+ if (((broadcast) && (max_dim > kTensorDimensionLimit)) ||
+ (!(((a.scalar_type() == ScalarType::Int) ||
+ (a.scalar_type() == ScalarType::Float)) &&
+ (a.scalar_type() == b.scalar_type()) &&
+ (a.scalar_type() == out.scalar_type())))) {
+ optimized = false;
}
- if ((compute_type == ScalarType::Int) && (optimized)) {
+ if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
const int* const inp1_data = a.const_data_ptr<int>();
const int* const inp2_data = b.const_data_ptr<int>();
int* const out_data = out.mutable_data_ptr<int>();
@@ -169,7 +155,7 @@ Tensor& add_out(
alpha_val,
out.numel());
}
- } else if ((compute_type == ScalarType::Float) && (optimized)) {
+ } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) {
const float* const inp1_data = a.const_data_ptr<float>();
const float* const inp2_data = b.const_data_ptr<float>();
float* const out_data = out.mutable_data_ptr<float>();
@@ -222,6 +208,23 @@ Tensor& add_out(
out.numel());
}
} else {
+ // Common Dtype
+ ScalarType common_type =
+ executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());
+ // Compute Dtype
+ ScalarType compute_type =
+ torch::executor::native::utils::get_compute_type(common_type);
+
+ // Check Common Dtype
+ ET_KERNEL_CHECK(
+ ctx,
+ (canCast(common_type, out.scalar_type()) &&
+ torch::executor::check_alpha_type(
+ torch::executor::native::utils::get_scalar_dtype(alpha),
+ common_type)),
+ InvalidArgument,
+ out);
+
ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
const CTYPE_COMPUTE val_alpha =
torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>(alpha);
@@ -249,22 +252,7 @@ Tensor& add_scalar_out(
const Scalar& b,
const Scalar& alpha,
Tensor& out) {
- // Common Dtype
- ScalarType common_type =
- torch::executor::native::utils::promote_type_with_scalar(
- a.scalar_type(), b);
-
#ifdef OP_ARG_CHECK
- // Check Common Dtype
- ET_KERNEL_CHECK(
- ctx,
- (common_type == out.scalar_type() &&
- torch::executor::check_alpha_type(
- torch::executor::native::utils::get_scalar_dtype(alpha),
- common_type)),
- InvalidArgument,
- out);
-
// Check Dim Order
ET_KERNEL_CHECK(
ctx,
@@ -279,14 +267,23 @@ Tensor& add_scalar_out(
InvalidArgument,
out);
#endif
- // Compute Dtype
- ScalarType compute_type =
- torch::executor::native::utils::get_compute_type(common_type);
// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "add.Scalar_out";
- if (compute_type == ScalarType::Int) {
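+ // The scalar fast path needs an Int or Float input whose dtype matches the
+ // output; an Int tensor combined with a floating-point scalar falls back.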
+ bool optimized = true;
+
+ if (!(((a.scalar_type() == ScalarType::Int) ||
+ (a.scalar_type() == ScalarType::Float)) &&
+ (a.scalar_type() == out.scalar_type()))) {
+ optimized = false;
+ }
+
+ if ((b.isFloatingPoint()) && (a.scalar_type() == ScalarType::Int)) {
+ optimized = false;
+ }
+
+ if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
const int* const inp1_data = a.const_data_ptr<int>();
int inp2_val;
torch::executor::native::utils::extract_scalar(b, &inp2_val);
@@ -306,7 +303,7 @@ Tensor& add_scalar_out(
alpha_val,
out.numel());
- } else if (compute_type == ScalarType::Float) {
+ } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) {
const float* const inp1_data = a.const_data_ptr<float>();
float inp2_val;
torch::executor::native::utils::extract_scalar(b, &inp2_val);
@@ -327,6 +324,24 @@ Tensor& add_scalar_out(
out.numel());
} else {
+ // Common Dtype
+ ScalarType common_type =
+ torch::executor::native::utils::promote_type_with_scalar(
+ a.scalar_type(), b);
+ // Compute Dtype
+ ScalarType compute_type =
+ torch::executor::native::utils::get_compute_type(common_type);
+
+ // Check Common Dtype
+ ET_KERNEL_CHECK(
+ ctx,
+ (common_type == out.scalar_type() &&
+ torch::executor::check_alpha_type(
+ torch::executor::native::utils::get_scalar_dtype(alpha),
+ common_type)),
+ InvalidArgument,
+ out);
+
ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
torch::executor::native::utils::
apply_unitensor_elementwise_fn(
diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp
index 74fd96a212..84224b37b0 100644
--- a/backends/cadence/fusion_g3/operators/op_cat.cpp
+++ b/backends/cadence/fusion_g3/operators/op_cat.cpp
@@ -46,11 +46,6 @@ Tensor& cat_out(
int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit;
#ifdef OP_ARG_CHECK
- ET_KERNEL_CHECK(
- ctx,
- torch::executor::check_cat_args(tensors, dim, out),
- InvalidArgument,
- out);
Tensor::SizesType expected_out_size[kTensorDimensionLimit];
size_t expected_out_dim = 0;
@@ -106,7 +101,16 @@ Tensor& cat_out(
out_shapes[i] = out_size[i];
}
- if ((out.scalar_type() == ScalarType::Int) ||
+ bool optimized = true;
+
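+ // The optimized copy kernel moves raw elements, so every input tensor must
+ // share the output dtype.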
+ for (int i = 0; i < tensors.size(); i++) {
+ if (out.scalar_type() != tensors[i].scalar_type()) {
+ optimized = false;
+ break;
+ }
+ }
+
+ if ((optimized) && (out.scalar_type() == ScalarType::Int) ||
(out.scalar_type() == ScalarType::Short) ||
(out.scalar_type() == ScalarType::Char) ||
(out.scalar_type() == ScalarType::UInt32) ||
@@ -125,6 +129,12 @@ Tensor& cat_out(
(int)dim,
get_element_size(out.scalar_type()));
} else {
+ ET_KERNEL_CHECK(
+ ctx,
+ torch::executor::check_cat_args(tensors, dim, out),
+ InvalidArgument,
+ out);
+
const size_t outer = executorch::runtime::getLeadingDims(out, dim);
const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim);
const size_t ninputs = tensors.size();
diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
index 3e0235170b..dd9d4f2a51 100644
--- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
@@ -117,7 +117,7 @@ Tensor& dequantize_impl(
}
}
} else {
- if (*zero_point_data != 0) // tesor
+ if (*zero_point_data != 0) // tensor
{
is_asym_dequant |= 1;
}
@@ -125,8 +125,14 @@ Tensor& dequantize_impl(
}
float* out_data = out.mutable_data_ptr<float>();
+ bool optimized = true;
+
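+ // The optimized dequantize kernels write float output only.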
+ if (out.scalar_type() != ScalarType::Float) {
+ optimized = false;
+ }
+
if (is_asym_dequant) {
- if (input.scalar_type() == ScalarType::Byte) {
+ if ((input.scalar_type() == ScalarType::Byte) && (optimized)) {
const uint8_t* input_data = input.const_data_ptr<uint8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -139,7 +145,7 @@ Tensor& dequantize_impl(
axis,
zero_point_data,
scale_data);
- } else if (input.scalar_type() == ScalarType::Char) {
+ } else if ((input.scalar_type() == ScalarType::Char) && (optimized)) {
const int8_t* input_data = input.const_data_ptr<int8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -152,7 +158,7 @@ Tensor& dequantize_impl(
axis,
zero_point_data,
scale_data);
- } else if (input.scalar_type() == ScalarType::UInt16) {
+ } else if ((input.scalar_type() == ScalarType::UInt16) && (optimized)) {
const uint16_t* input_data = input.const_data_ptr<uint16_t>();
XT_KERNEL_CHECK(
ctx,
@@ -165,7 +171,7 @@ Tensor& dequantize_impl(
axis,
zero_point_data,
scale_data);
- } else if (input.scalar_type() == ScalarType::Short) {
+ } else if ((input.scalar_type() == ScalarType::Short) && (optimized)) {
const int16_t* input_data = input.const_data_ptr<int16_t>();
XT_KERNEL_CHECK(
ctx,
@@ -178,7 +184,7 @@ Tensor& dequantize_impl(
axis,
zero_point_data,
scale_data);
- } else if (input.scalar_type() == (ScalarType)Bits4u) {
+ } else if ((input.scalar_type() == (ScalarType)Bits4u) && (optimized)) {
const uint8_t* input_data = input.const_data_ptr<uint8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -191,7 +197,7 @@ Tensor& dequantize_impl(
axis,
zero_point_data,
scale_data);
- } else if (input.scalar_type() == (ScalarType)Bits4) {
+ } else if ((input.scalar_type() == (ScalarType)Bits4) && (optimized)) {
const int8_t* input_data = input.const_data_ptr<int8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -338,7 +344,7 @@ Tensor& dequantize_impl(
}
}
} else {
- if (input.scalar_type() == ScalarType::Byte) {
+ if ((input.scalar_type() == ScalarType::Byte) && (optimized)) {
const uint8_t* input_data = input.const_data_ptr<uint8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -350,7 +356,7 @@ Tensor& dequantize_impl(
input.dim(),
axis,
scale_data);
- } else if (input.scalar_type() == ScalarType::Char) {
+ } else if ((input.scalar_type() == ScalarType::Char) && (optimized)) {
const int8_t* input_data = input.const_data_ptr<int8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -362,7 +368,7 @@ Tensor& dequantize_impl(
input.dim(),
axis,
scale_data);
- } else if (input.scalar_type() == ScalarType::UInt16) {
+ } else if ((input.scalar_type() == ScalarType::UInt16) && (optimized)) {
const uint16_t* input_data = input.const_data_ptr<uint16_t>();
XT_KERNEL_CHECK(
ctx,
@@ -374,7 +380,7 @@ Tensor& dequantize_impl(
input.dim(),
axis,
scale_data);
- } else if (input.scalar_type() == ScalarType::Short) {
+ } else if ((input.scalar_type() == ScalarType::Short) && (optimized)) {
const int16_t* input_data = input.const_data_ptr<int16_t>();
XT_KERNEL_CHECK(
ctx,
@@ -386,7 +392,7 @@ Tensor& dequantize_impl(
input.dim(),
axis,
scale_data);
- } else if (input.scalar_type() == (ScalarType)Bits4u) {
+ } else if ((input.scalar_type() == (ScalarType)Bits4u) && (optimized)) {
const uint8_t* input_data = input.const_data_ptr<uint8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -398,7 +404,7 @@ Tensor& dequantize_impl(
input.dim(),
axis,
scale_data);
- } else if (input.scalar_type() == (ScalarType)Bits4) {
+ } else if ((input.scalar_type() == (ScalarType)Bits4) && (optimized)) {
const int8_t* input_data = input.const_data_ptr<int8_t>();
XT_KERNEL_CHECK(
ctx,
diff --git a/backends/cadence/fusion_g3/operators/op_div.cpp b/backends/cadence/fusion_g3/operators/op_div.cpp
index 1461f643a8..85e5da4276 100644
--- a/backends/cadence/fusion_g3/operators/op_div.cpp
+++ b/backends/cadence/fusion_g3/operators/op_div.cpp
@@ -54,10 +54,6 @@ Tensor& div_out(
const Tensor& a,
const Tensor& b,
Tensor& out) {
- // Common Dtype
- ScalarType common_type =
- executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());
-
#ifdef OP_ARG_CHECK
// Check Dim Order
ET_KERNEL_CHECK(
@@ -73,11 +69,6 @@ Tensor& div_out(
InvalidArgument,
out);
#endif
-
- // Compute Dtype
- ScalarType compute_type =
- torch::executor::native::utils::get_compute_type(common_type);
-
// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "div.out";
@@ -87,12 +78,12 @@ Tensor& div_out(
int inp2_shape[kTensorDimensionLimit];
int out_shape[kTensorDimensionLimit];
- bool broadcast = 0;
+ bool broadcast = false;
int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
max_dim = out.dim() > max_dim ? out.dim() : max_dim;
- bool optimized = 1;
+ bool optimized = true;
for (int i = 0; i < max_dim; i++) {
out_shape[i] = 1;
@@ -118,15 +109,19 @@ Tensor& div_out(
for (int i = 0; i < out.dim(); i++) {
if (((inp1_shape[i]) != (out_shape[i])) ||
((inp2_shape[i]) != (out_shape[i]))) {
- broadcast = 1;
+ broadcast = true;
}
}
- if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
- optimized = 0;
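+ // The fast path needs Int or Float inputs of matching dtype, a float output,
+ // and broadcasting that fits within kTensorDimensionLimit.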
+ if (((broadcast) && (max_dim > kTensorDimensionLimit)) ||
+ (!(((a.scalar_type() == ScalarType::Int) ||
+ (a.scalar_type() == ScalarType::Float)) &&
+ (a.scalar_type() == b.scalar_type()) &&
+ (out.scalar_type() == ScalarType::Float)))) {
+ optimized = false;
}
- if ((compute_type == ScalarType::Int) && (optimized)) {
+ if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
const int* const inp1_data = a.const_data_ptr<int>();
const int* const inp2_data = b.const_data_ptr<int>();
float* const out_data = out.mutable_data_ptr<float>();
@@ -162,7 +157,7 @@ Tensor& div_out(
inp2_data,
out.numel());
}
- } else if ((compute_type == ScalarType::Float) && (optimized)) {
+ } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) {
const float* const inp1_data = a.const_data_ptr<float>();
const float* const inp2_data = b.const_data_ptr<float>();
float* const out_data = out.mutable_data_ptr<float>();
@@ -244,19 +239,7 @@ Tensor& div_out_mode(
ET_KERNEL_CHECK(
ctx, mode_val == "trunc" || mode_val == "floor", InvalidArgument, out);
- // Common Dtype
- ScalarType common_type =
- executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());
-
#ifdef OP_ARG_CHECK
- // Check Common Dtype
- ET_KERNEL_CHECK(
- ctx,
- (canCast(common_type, out.scalar_type()) &&
- common_type != ScalarType::Bool),
- InvalidArgument,
- out);
-
// Check Dim Order
ET_KERNEL_CHECK(
ctx,
@@ -271,9 +254,6 @@ Tensor& div_out_mode(
InvalidArgument,
out);
#endif
- // Compute Dtype
- ScalarType compute_type =
- torch::executor::native::utils::get_compute_type(common_type);
// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "div.out_mode";
@@ -287,12 +267,12 @@ Tensor& div_out_mode(
int inp2_shape[kTensorDimensionLimit];
int out_shape[kTensorDimensionLimit];
- bool broadcast = 0;
+ bool broadcast = false;
int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
max_dim = out.dim() > max_dim ? out.dim() : max_dim;
- bool optimized = 1;
+ bool optimized = true;
for (int i = 0; i < max_dim; i++) {
out_shape[i] = 1;
@@ -318,17 +298,21 @@ Tensor& div_out_mode(
for (int i = 0; i < out.dim(); i++) {
if (((inp1_shape[i]) != (out_shape[i])) ||
((inp2_shape[i]) != (out_shape[i]))) {
- broadcast = 1;
+ broadcast = true;
}
}
- if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
- optimized = 0;
+ if (((broadcast) && (max_dim > kTensorDimensionLimit)) ||
+ (!(((a.scalar_type() == ScalarType::Int) ||
+ (a.scalar_type() == ScalarType::Float)) &&
+ (a.scalar_type() == b.scalar_type()) &&
+ (a.scalar_type() == out.scalar_type())))) {
+ optimized = false;
}
int mode_value = (mode_val == "trunc") ? 1 : 2;
- if ((compute_type == ScalarType::Int) && (optimized)) {
+ if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
const int* const inp1_data = a.const_data_ptr<int>();
const int* const inp2_data = b.const_data_ptr<int>();
int* const out_data = out.mutable_data_ptr<int>();
@@ -367,7 +351,7 @@ Tensor& div_out_mode(
mode_value,
out.numel());
}
- } else if ((compute_type == ScalarType::Float) && (optimized)) {
+ } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) {
const float* const inp1_data = a.const_data_ptr<float>();
const float* const inp2_data = b.const_data_ptr<float>();
float* const out_data = out.mutable_data_ptr<float>();
@@ -407,6 +391,21 @@ Tensor& div_out_mode(
out.numel());
}
} else {
+ // Common Dtype
+ ScalarType common_type =
+ executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());
+ // Compute Dtype
+ ScalarType compute_type =
+ torch::executor::native::utils::get_compute_type(common_type);
+
+ // Check Common Dtype
+ ET_KERNEL_CHECK(
+ ctx,
+ (canCast(common_type, out.scalar_type()) &&
+ common_type != ScalarType::Bool),
+ InvalidArgument,
+ out);
+
ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
torch::executor::native::utils::
apply_bitensor_elementwise_fn(
@@ -456,15 +455,7 @@ Tensor& div_scalar_out(
const Tensor& a,
const Scalar& b,
Tensor& out) {
- // Common Dtype
- ScalarType common_type =
- torch::executor::native::utils::promote_type_with_scalar(
- a.scalar_type(), b);
-
#ifdef OP_ARG_CHECK
- // Check Common Dtype
- ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out);
-
// Check Dim Order
ET_KERNEL_CHECK(
ctx,
@@ -480,14 +471,22 @@ Tensor& div_scalar_out(
out);
#endif
- // Compute Dtype
- ScalarType compute_type =
- torch::executor::native::utils::get_compute_type(common_type);
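+ // The scalar fast path takes an Int or Float input and writes a float
+ // output; an Int tensor divided by a floating-point scalar falls back.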
+ bool optimized = true;
+
+ if (!(((a.scalar_type() == ScalarType::Int) ||
+ (a.scalar_type() == ScalarType::Float)) &&
+ (out.scalar_type() == ScalarType::Float))) {
+ optimized = false;
+ }
+
+ if ((b.isFloatingPoint()) && (a.scalar_type() == ScalarType::Int)) {
+ optimized = false;
+ }
// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "div.Scalar_out";
- if (compute_type == ScalarType::Int) {
+ if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
const int* const inp1_data = a.const_data_ptr<int>();
int inp2_val;
torch::executor::native::utils::extract_scalar(b, &inp2_val);
@@ -502,7 +501,7 @@ Tensor& div_scalar_out(
inp1_data,
inp2_val,
out.numel());
- } else if (compute_type == ScalarType::Float) {
+ } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) {
const float* const inp1_data = a.const_data_ptr<float>();
float inp2_val;
torch::executor::native::utils::extract_scalar(b, &inp2_val);
@@ -526,6 +525,11 @@ Tensor& div_scalar_out(
: ScalarType::Float;
ScalarType compute_type =
torch::executor::native::utils::get_compute_type(common_type);
+
+ // Check Common Dtype
+ ET_KERNEL_CHECK(
+ ctx, common_type == out.scalar_type(), InvalidArgument, out);
+
ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
const CTYPE_COMPUTE val_b =
torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>(b);
@@ -560,29 +564,7 @@ Tensor& div_scalar_mode_out(
ET_KERNEL_CHECK(
ctx, mode_val == "trunc" || mode_val == "floor", InvalidArgument, out);
- // Common Dtype
- ScalarType common_type =
- torch::executor::native::utils::promote_type_with_scalar(
- a.scalar_type(), b);
-
#ifdef OP_ARG_CHECK
- // Check Common Dtype
- ET_KERNEL_CHECK(
- ctx,
- (canCast(common_type, out.scalar_type()) &&
- common_type != ScalarType::Bool),
- InvalidArgument,
- out);
-
- // Check for intergral division by zero
- ET_KERNEL_CHECK_MSG(
- ctx,
- !(executorch::runtime::isIntegralType(common_type, true) &&
- torch::executor::native::utils::scalar_to(b) == 0),
- InvalidArgument,
- out,
- "Div mode operation encountered integer division by zero");
-
// Check Dim Order
ET_KERNEL_CHECK(
ctx,
@@ -598,18 +580,26 @@ Tensor& div_scalar_mode_out(
out);
#endif
- // Compute Dtype
- ScalarType compute_type =
- torch::executor::native::utils::get_compute_type(common_type);
-
const bool mode_is_trunc = mode_val == "trunc";
+ bool optimized = true;
+
+ if (!(((a.scalar_type() == ScalarType::Int) ||
+ (a.scalar_type() == ScalarType::Float)) &&
+ (a.scalar_type() == out.scalar_type()))) {
+ optimized = false;
+ }
+
+ if ((b.isFloatingPoint()) && (a.scalar_type() == ScalarType::Int)) {
+ optimized = false;
+ }
+
// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "div.Scalar_mode_out";
int mode_value = (mode_val == "trunc") ? 1 : 2;
- if (compute_type == ScalarType::Int) {
+ if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
const int* const inp1_data = a.const_data_ptr<int>();
int inp2_val;
torch::executor::native::utils::extract_scalar(b, &inp2_val);
@@ -625,7 +615,7 @@ Tensor& div_scalar_mode_out(
inp2_val,
mode_value,
out.numel());
- } else if (compute_type == ScalarType::Float) {
+ } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) {
const float* const inp1_data = a.const_data_ptr<float>();
float inp2_val;
torch::executor::native::utils::extract_scalar(b, &inp2_val);
@@ -642,6 +632,31 @@ Tensor& div_scalar_mode_out(
mode_value,
out.numel());
} else {
+ // Common Dtype
+ ScalarType common_type =
+ torch::executor::native::utils::promote_type_with_scalar(
+ a.scalar_type(), b);
+ // Compute Dtype
+ ScalarType compute_type =
+ torch::executor::native::utils::get_compute_type(common_type);
+
+ // Check Common Dtype
+ ET_KERNEL_CHECK(
+ ctx,
+ (canCast(common_type, out.scalar_type()) &&
+ common_type != ScalarType::Bool),
+ InvalidArgument,
+ out);
+
+ // Check for integral division by zero
+ ET_KERNEL_CHECK_MSG(
+ ctx,
+ !(executorch::runtime::isIntegralType(common_type, true) &&
+ torch::executor::native::utils::scalar_to(b) == 0),
+ InvalidArgument,
+ out,
+ "Div mode operation encountered integer division by zero");
+
ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
const CTYPE_COMPUTE val_b =
torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>(b);
diff --git a/backends/cadence/fusion_g3/operators/op_exp.cpp b/backends/cadence/fusion_g3/operators/op_exp.cpp
index 4b6b898b17..41b5d70b22 100644
--- a/backends/cadence/fusion_g3/operators/op_exp.cpp
+++ b/backends/cadence/fusion_g3/operators/op_exp.cpp
@@ -49,9 +49,10 @@ Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
out);
#endif
- if (in.scalar_type() == ScalarType::Float) {
- float* __restrict__ out_data = out.mutable_data_ptr();
- const float* __restrict__ in_data = in.const_data_ptr();
+ if ((in.scalar_type() == ScalarType::Float) &&
+ (out.scalar_type() == ScalarType::Float)) {
+ float* const out_data = out.mutable_data_ptr<float>();
+ const float* const in_data = in.const_data_ptr<float>();
XT_KERNEL_CHECK(
ctx, out, xa_nn_elm_exp_f32_f32, out_data, in_data, out.numel());
diff --git a/backends/cadence/fusion_g3/operators/op_mean.cpp b/backends/cadence/fusion_g3/operators/op_mean.cpp
index 289baceb12..ae0cfd1e27 100644
--- a/backends/cadence/fusion_g3/operators/op_mean.cpp
+++ b/backends/cadence/fusion_g3/operators/op_mean.cpp
@@ -44,15 +44,16 @@ int prepare_data(
for (int i = 0; i < num_out_dims; i++) {
out_shape[i] = out.size(i);
}
-
int num_axis_dims = 0;
- for (const auto& d : dim_list.value()) {
- if (d < 0) {
- p_axis[num_axis_dims] = num_inp_dims + d;
- num_axis_dims++;
- } else {
- p_axis[num_axis_dims] = d;
- num_axis_dims++;
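+ // An empty dim_list means reduce over all dimensions; only iterate when a
+ // value is present.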
+ if (dim_list.has_value()) {
+ for (const auto& d : dim_list.value()) {
+ if (d < 0) {
+ p_axis[num_axis_dims] = num_inp_dims + d;
+ num_axis_dims++;
+ } else {
+ p_axis[num_axis_dims] = d;
+ num_axis_dims++;
+ }
}
}
@@ -69,12 +70,6 @@ Tensor& mean_out(
(void)ctx;
#ifdef OP_ARG_CHECK
- ET_KERNEL_CHECK(
- ctx,
- torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out),
- InvalidArgument,
- out);
-
ET_KERNEL_CHECK(
ctx,
executorch::runtime::tensors_have_same_dim_order(in, out),
@@ -97,13 +92,14 @@ Tensor& mean_out(
constexpr int kNnlibMaxDim = 5;
- bool optimized = 1;
+ bool optimized = true;
- if (out.scalar_type() != ScalarType::Float)
- optimized = 0;
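+ // The optimized reduction handles float input and output only.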
+ if (!((out.scalar_type() == ScalarType::Float) &&
+ (in.scalar_type() == ScalarType::Float)))
+ optimized = false;
if (in.dim() > kNnlibMaxDim)
- optimized = 0;
+ optimized = false;
if (optimized) {
float* __restrict__ p_out = out.mutable_data_ptr<float>();
@@ -135,9 +131,8 @@ Tensor& mean_out(
num_inp_dims,
num_out_dims);
- if (num_axis_dims == num_inp_dims) {
+ if ((num_axis_dims == num_inp_dims) || (!dim_list.has_value())) {
num_out_dims = 1;
- out_shape[0] = 1;
}
int inp_shape_max = inp_shape[p_axis[0]];
@@ -168,29 +163,38 @@ Tensor& mean_out(
num_axis_dims,
p_scratch_in);
} else {
- ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] {
- ET_SWITCH_FLOATH_TYPES(
- out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] {
- CTYPE_OUT* out_data = out.mutable_data_ptr();
- const size_t num =
- torch::executor::get_reduced_dim_product(in, dim_list);
- for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
- CTYPE_OUT sum = 0;
- if (in.numel() > 0) {
- sum = torch::executor::
- map_reduce_over_dim_list(
- [](CTYPE_IN v) { return static_cast(v); },
- [](CTYPE_OUT outv, CTYPE_OUT acc) {
- return acc + outv;
- },
- in,
- dim_list,
- out_ix);
- }
- out_data[out_ix] = sum / static_cast(num);
- }
- });
- });
+ ET_KERNEL_CHECK(
+ ctx,
+ torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out),
+ InvalidArgument,
+ out);
+
+ ET_SWITCH_REALHBBF16_TYPES(
+ in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] {
+ ET_SWITCH_FLOATHBF16_TYPES(
+ out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] {
+ CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+ const size_t num =
+ torch::executor::get_reduced_dim_product(in, dim_list);
+ for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
+ CTYPE_OUT sum = 0;
+ if (in.numel() > 0) {
+ sum = torch::executor::
+ map_reduce_over_dim_list<CTYPE_IN, CTYPE_OUT>(
+ [](CTYPE_IN v) {
+ return static_cast<CTYPE_OUT>(v);
+ },
+ [](CTYPE_OUT outv, CTYPE_OUT acc) {
+ return acc + outv;
+ },
+ in,
+ dim_list,
+ out_ix);
+ }
+ out_data[out_ix] = sum / static_cast<CTYPE_OUT>(num);
+ }
+ });
+ });
}
return out;
diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp
index 93b4c5a992..bee6ac9cbd 100644
--- a/backends/cadence/fusion_g3/operators/op_mul.cpp
+++ b/backends/cadence/fusion_g3/operators/op_mul.cpp
@@ -33,15 +33,7 @@ Tensor& mul_out(
const Tensor& a,
const Tensor& b,
Tensor& out) {
- // Common Dtype
- ScalarType common_type =
- executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());
-
#ifdef OP_ARG_CHECK
- // Check Common Dtype
- ET_KERNEL_CHECK(
- ctx, canCast(common_type, out.scalar_type()), InvalidArgument, out);
-
// Check Dim Order
ET_KERNEL_CHECK(
ctx,
@@ -57,10 +49,6 @@ Tensor& mul_out(
out);
#endif
- // Compute Dtype
- ScalarType compute_type =
- torch::executor::native::utils::get_compute_type(common_type);
-
// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "mul.out";
int kTensorDimensionLimit = 5;
@@ -69,12 +57,12 @@ Tensor& mul_out(
int inp2_shape[kTensorDimensionLimit];
int out_shape[kTensorDimensionLimit];
- bool broadcast = 0;
+ bool broadcast = false;
int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
max_dim = out.dim() > max_dim ? out.dim() : max_dim;
- bool optimized = 1;
+ bool optimized = true;
/* Added change to work with input dimensions more than 5 */
for (int i = 0; i < max_dim; i++) {
@@ -101,15 +89,19 @@ Tensor& mul_out(
for (int i = 0; i < out.dim(); i++) {
if (((inp1_shape[i]) != (out_shape[i])) ||
((inp2_shape[i]) != (out_shape[i]))) {
- broadcast = 1;
+ broadcast = true;
}
}
- if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
- optimized = 0;
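+ // Vectorized mul requires a, b, and out to share an Int or Float dtype and
+ // broadcasting to stay within kTensorDimensionLimit.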
+ if (((broadcast) && (max_dim > kTensorDimensionLimit)) ||
+ (!(((a.scalar_type() == ScalarType::Int) ||
+ (a.scalar_type() == ScalarType::Float)) &&
+ (a.scalar_type() == b.scalar_type()) &&
+ (a.scalar_type() == out.scalar_type())))) {
+ optimized = false;
}
- if ((compute_type == ScalarType::Int) && (optimized)) {
+ if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
const int* const inp1_data = a.const_data_ptr<int>();
const int* const inp2_data = b.const_data_ptr<int>();
int* const out_data = out.mutable_data_ptr<int>();
@@ -154,7 +146,7 @@ Tensor& mul_out(
inp2_data,
out.numel());
}
- } else if ((compute_type == ScalarType::Float) && (optimized)) {
+ } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) {
const float* const inp1_data = a.const_data_ptr<float>();
const float* const inp2_data = b.const_data_ptr<float>();
float* const out_data = out.mutable_data_ptr<float>();
@@ -200,6 +192,16 @@ Tensor& mul_out(
out.numel());
}
} else {
+ // Common Dtype
+ ScalarType common_type =
+ executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());
+ // Compute Dtype
+ ScalarType compute_type =
+ torch::executor::native::utils::get_compute_type(common_type);
+ // Check Common Dtype
+ ET_KERNEL_CHECK(
+ ctx, canCast(common_type, out.scalar_type()), InvalidArgument, out);
+
ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
torch::executor::native::utils::apply_bitensor_elementwise_fn<
CTYPE_COMPUTE,
@@ -224,15 +226,7 @@ Tensor& mul_scalar_out(
const Tensor& a,
const Scalar& b,
Tensor& out) {
- // Common Dtype
- ScalarType common_type =
- torch::executor::native::utils::promote_type_with_scalar(
- a.scalar_type(), b);
-
#ifdef OP_ARG_CHECK
- // Check Common Dtype
- ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out);
-
// Check Dim Order
ET_KERNEL_CHECK(
ctx,
@@ -244,13 +238,23 @@ Tensor& mul_scalar_out(
ET_KERNEL_CHECK(
ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out);
#endif
- // Compute Dtype
- ScalarType compute_type =
- torch::executor::native::utils::get_compute_type(common_type);
// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "mul.Scalar_out";
- if (compute_type == ScalarType::Int) {
+
+ bool optimized = true;
+
+ if (!(((a.scalar_type() == ScalarType::Int) ||
+ (a.scalar_type() == ScalarType::Float)) &&
+ (a.scalar_type() == out.scalar_type()))) {
+ optimized = false;
+ }
+
+ if ((b.isFloatingPoint()) && (a.scalar_type() == ScalarType::Int)) {
+ optimized = false;
+ }
+
+ if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
const int* const inp1_data = a.const_data_ptr<int>();
int inp2_val;
torch::executor::native::utils::extract_scalar(b, &inp2_val);
@@ -264,7 +268,7 @@ Tensor& mul_scalar_out(
inp1_data,
inp2_val,
out.numel());
- } else if (compute_type == ScalarType::Float) {
+ } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) {
const float* const inp1_data = a.const_data_ptr<float>();
float inp2_val;
torch::executor::native::utils::extract_scalar(b, &inp2_val);
@@ -279,6 +283,17 @@ Tensor& mul_scalar_out(
inp2_val,
out.numel());
} else {
+ // Common Dtype
+ ScalarType common_type =
+ torch::executor::native::utils::promote_type_with_scalar(
+ a.scalar_type(), b);
+ // Compute Dtype
+ ScalarType compute_type =
+ torch::executor::native::utils::get_compute_type(common_type);
+ // Check Common Dtype
+ ET_KERNEL_CHECK(
+ ctx, common_type == out.scalar_type(), InvalidArgument, out);
+
ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
const CTYPE_COMPUTE val_b =
torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>(b);
diff --git a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
index 9857bbce37..b4f076e810 100644
--- a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
+++ b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
@@ -123,14 +123,7 @@ std::tuple native_layer_norm_out(
std::tuple<Tensor&, Tensor&, Tensor&> ret_val(out, mean_out, rstd_out);
int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit;
-
#ifdef OP_ARG_CHECK
- ET_KERNEL_CHECK(
- ctx,
- torch::executor::check_layer_norm_args(
- input, normalized_shape, weight, bias, out, mean_out, rstd_out),
- InvalidArgument,
- ret_val);
// Only support default dim order for now.
// TODO: Support other dim orders.
@@ -189,12 +182,34 @@ std::tuple native_layer_norm_out(
ret_val);
#endif
+ bool optimized = true;
+
int input_shape[kTensorDimensionLimit];
for (int i = 0; i < input.dim(); i++) {
input_shape[i] = input.size(i);
}
- if (out.scalar_type() == ScalarType::Float) {
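+ // The optimized kernel needs float input, out, mean_out, and rstd_out, plus
+ // matching dtypes for weight and bias when they are provided.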
+ if (!(((input.scalar_type() == ScalarType::Float) &&
+ (input.scalar_type() == out.scalar_type()) &&
+ (out.scalar_type() == mean_out.scalar_type()) &&
+ (mean_out.scalar_type() == rstd_out.scalar_type())))) {
+ optimized = false;
+ }
+
+ if (optimized) {
+ if (weight.has_value()) {
+ if (!(input.scalar_type() == weight.value().scalar_type())) {
+ optimized = false;
+ }
+ }
+ if (bias.has_value()) {
+ if (!(input.scalar_type() == bias.value().scalar_type())) {
+ optimized = false;
+ }
+ }
+ }
+
+ if ((input.scalar_type() == ScalarType::Float) && (optimized)) {
float* const out_data = out.mutable_data_ptr<float>();
float* const mean_data = mean_out.mutable_data_ptr<float>();
float* const rstd_data = rstd_out.mutable_data_ptr<float>();
@@ -247,6 +262,13 @@ std::tuple native_layer_norm_out(
free(weight_data);
}
} else {
+ ET_KERNEL_CHECK(
+ ctx,
+ torch::executor::check_layer_norm_args(
+ input, normalized_shape, weight, bias, out, mean_out, rstd_out),
+ InvalidArgument,
+ ret_val);
+
ET_SWITCH_FLOAT_TYPES(
input.scalar_type(), ctx, "native_layer_norm.out", CTYPE, [&]() {
layer_norm(
diff --git a/backends/cadence/fusion_g3/operators/op_permute_copy.cpp b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp
index 23c2d1e5fb..34def4fd1b 100644
--- a/backends/cadence/fusion_g3/operators/op_permute_copy.cpp
+++ b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp
@@ -65,12 +65,6 @@ Tensor& permute_copy_out(
* the checks only in operator level(As there are no checks in kernel).
*/
#ifdef OP_ARG_CHECK
- ET_KERNEL_CHECK(
- ctx,
- torch::executor::check_permute_copy_args(in, dims, out),
- InvalidArgument,
- out);
-
ET_KERNEL_CHECK(
ctx,
executorch::runtime::tensors_have_same_dim_order(in, out),
@@ -112,7 +106,8 @@ Tensor& permute_copy_out(
signed char* out_data = out.mutable_data_ptr<signed char>();
const signed char* const inp_data = in.const_data_ptr<signed char>();
- if (((out.scalar_type() == ScalarType::Int) ||
+ if (((out.scalar_type() == in.scalar_type()) &&
+ (out.scalar_type() == ScalarType::Int) ||
(out.scalar_type() == ScalarType::Short) ||
(out.scalar_type() == ScalarType::Char) ||
(out.scalar_type() == ScalarType::UInt32) ||
@@ -131,9 +126,15 @@ Tensor& permute_copy_out(
in.dim(),
get_element_size(out.scalar_type()));
} else {
+ ET_KERNEL_CHECK(
+ ctx,
+ torch::executor::check_permute_copy_args(in, dims, out),
+ InvalidArgument,
+ out);
+
const auto in_type = out.scalar_type();
- size_t in_coord[5] = {0};
- size_t trailing_dims_memo[kTensorDimensionLimit];
+ size_t in_coord[executorch::runtime::kTensorDimensionLimit] = {0};
+ size_t trailing_dims_memo[executorch::runtime::kTensorDimensionLimit];
executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo);
// in and out must be the same dtype
ET_SWITCH_ALL_TYPES(in_type, ctx, "permute_copy.out", CTYPE, [&] {
diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp
index 8237c3c266..2af77eca6c 100644
--- a/backends/cadence/fusion_g3/operators/op_quantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_quantize.cpp
@@ -159,6 +159,12 @@ Tensor& quantize_impl(
bool is_asym_quant = 0;
+ bool optimized = true;
+
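+ // The optimized quantize kernels expect float input.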
+ if (input.scalar_type() != ScalarType::Float) {
+ optimized = false;
+ }
+
if (zero_point_data != NULL) // asymmetric quant
{
if (axis != NULL) // channel
@@ -177,7 +183,7 @@ Tensor& quantize_impl(
}
if (is_asym_quant) {
- if (out.scalar_type() == ScalarType::Byte) {
+ if ((out.scalar_type() == ScalarType::Byte) && (optimized)) {
uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -192,7 +198,7 @@ Tensor& quantize_impl(
zero_point_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == ScalarType::Char) {
+ } else if ((out.scalar_type() == ScalarType::Char) && (optimized)) {
int8_t* out_data = out.mutable_data_ptr<int8_t>();
XT_KERNEL_CHECK(
@@ -208,7 +214,7 @@ Tensor& quantize_impl(
zero_point_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == ScalarType::UInt16) {
+ } else if ((out.scalar_type() == ScalarType::UInt16) && (optimized)) {
uint16_t* out_data = out.mutable_data_ptr<uint16_t>();
XT_KERNEL_CHECK(
ctx,
@@ -223,7 +229,7 @@ Tensor& quantize_impl(
zero_point_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == ScalarType::Short) {
+ } else if ((out.scalar_type() == ScalarType::Short) && (optimized)) {
int16_t* out_data = out.mutable_data_ptr<int16_t>();
XT_KERNEL_CHECK(
ctx,
@@ -238,7 +244,7 @@ Tensor& quantize_impl(
zero_point_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == (ScalarType)Bits4u) {
+ } else if ((out.scalar_type() == (ScalarType)Bits4u) && (optimized)) {
uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -253,7 +259,7 @@ Tensor& quantize_impl(
zero_point_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == (ScalarType)Bits4) {
+ } else if ((out.scalar_type() == (ScalarType)Bits4) && (optimized)) {
int8_t* out_data = out.mutable_data_ptr<int8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -391,7 +397,7 @@ Tensor& quantize_impl(
#undef ASYM_QUANTIZE_IMPL_CHANNEL
}
} else {
- if (out.scalar_type() == ScalarType::Byte) {
+ if ((out.scalar_type() == ScalarType::Byte) && (optimized)) {
uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -405,7 +411,7 @@ Tensor& quantize_impl(
scale_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == ScalarType::Char) {
+ } else if ((out.scalar_type() == ScalarType::Char) && (optimized)) {
int8_t* out_data = out.mutable_data_ptr<int8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -419,7 +425,7 @@ Tensor& quantize_impl(
scale_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == ScalarType::UInt16) {
+ } else if ((out.scalar_type() == ScalarType::UInt16) && (optimized)) {
uint16_t* out_data = out.mutable_data_ptr<uint16_t>();
XT_KERNEL_CHECK(
ctx,
@@ -433,7 +439,7 @@ Tensor& quantize_impl(
scale_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == ScalarType::Short) {
+ } else if ((out.scalar_type() == ScalarType::Short) && (optimized)) {
int16_t* out_data = out.mutable_data_ptr<int16_t>();
XT_KERNEL_CHECK(
ctx,
@@ -447,7 +453,7 @@ Tensor& quantize_impl(
scale_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == (ScalarType)Bits4u) {
+ } else if ((out.scalar_type() == (ScalarType)Bits4u) && (optimized)) {
uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
XT_KERNEL_CHECK(
ctx,
@@ -461,7 +467,7 @@ Tensor& quantize_impl(
scale_data,
quant_min,
quant_max);
- } else if (out.scalar_type() == (ScalarType)Bits4) {
+ } else if ((out.scalar_type() == (ScalarType)Bits4) && (optimized)) {
int8_t* out_data = out.mutable_data_ptr<int8_t>();
XT_KERNEL_CHECK(
ctx,
diff --git a/backends/cadence/fusion_g3/operators/op_slice_copy.cpp b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp
index c481cf726b..9158eecf13 100644
--- a/backends/cadence/fusion_g3/operators/op_slice_copy.cpp
+++ b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp
@@ -58,12 +58,6 @@ Tensor& slice_copy_Tensor_out(
int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit;
#ifdef OP_ARG_CHECK
- ET_KERNEL_CHECK(
- ctx,
- torch::executor::check_slice_copy_args(in, dim, step, out),
- InvalidArgument,
- out);
-
ET_KERNEL_CHECK(
ctx,
executorch::runtime::tensors_have_same_dim_order(in, out),
@@ -101,12 +95,13 @@ Tensor& slice_copy_Tensor_out(
signed char* out_data = out.mutable_data_ptr<signed char>();
const signed char* const inp_data = in.const_data_ptr<signed char>();
- if ((out.scalar_type() == ScalarType::Int) ||
- (out.scalar_type() == ScalarType::Short) ||
- (out.scalar_type() == ScalarType::Char) ||
- (out.scalar_type() == ScalarType::UInt32) ||
- (out.scalar_type() == ScalarType::UInt16) ||
- (out.scalar_type() == ScalarType::Byte)) {
+ if ((out.scalar_type() == in.scalar_type()) &&
+ ((out.scalar_type() == ScalarType::Int) ||
+ (out.scalar_type() == ScalarType::Short) ||
+ (out.scalar_type() == ScalarType::Char) ||
+ (out.scalar_type() == ScalarType::UInt32) ||
+ (out.scalar_type() == ScalarType::UInt16) ||
+ (out.scalar_type() == ScalarType::Byte))) {
XT_KERNEL_CHECK(
ctx,
out,
@@ -122,6 +117,12 @@ Tensor& slice_copy_Tensor_out(
(int)dim,
get_element_size(out.scalar_type()));
} else {
+ ET_KERNEL_CHECK(
+ ctx,
+ torch::executor::check_slice_copy_args(in, dim, step, out),
+ InvalidArgument,
+ out);
+
torch::executor::compute_slice(in, dim, start, length, step, out);
}
diff --git a/backends/cadence/fusion_g3/operators/op_softmax.cpp b/backends/cadence/fusion_g3/operators/op_softmax.cpp
index ee87ebaf5a..14b128e928 100644
--- a/backends/cadence/fusion_g3/operators/op_softmax.cpp
+++ b/backends/cadence/fusion_g3/operators/op_softmax.cpp
@@ -39,14 +39,7 @@ Tensor& _softmax_out(
// Adjust for negative dim
dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim;
-
#ifdef OP_ARG_CHECK
- ET_KERNEL_CHECK(
- ctx,
- torch::executor::check_softmax_args(in, dim, half_to_float, out),
- InvalidArgument,
- out);
-
ET_KERNEL_CHECK(
ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);
@@ -63,7 +56,8 @@ Tensor& _softmax_out(
inp_shapes[i] = in_size[i];
}
- if (out.scalar_type() == ScalarType::Float) {
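+ // Require float input and output before dispatching to the optimized kernel.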
+ if ((in.scalar_type() == ScalarType::Float) &&
+ (out.scalar_type() == ScalarType::Float)) {
const float* const inp_data = in.const_data_ptr<float>();
float* const out_data = out.mutable_data_ptr<float>();
int axis = dim;
@@ -77,6 +71,12 @@ Tensor& _softmax_out(
in.dim(),
&axis);
} else {
+ ET_KERNEL_CHECK(
+ ctx,
+ torch::executor::check_softmax_args(in, dim, half_to_float, out),
+ InvalidArgument,
+ out);
+
ET_SWITCH_FLOATH_TYPES(in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() {
const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
diff --git a/backends/cadence/fusion_g3/operators/op_sub.cpp b/backends/cadence/fusion_g3/operators/op_sub.cpp
index 4bae81c5b2..9bafec5df9 100644
--- a/backends/cadence/fusion_g3/operators/op_sub.cpp
+++ b/backends/cadence/fusion_g3/operators/op_sub.cpp
@@ -35,19 +35,6 @@ Tensor& sub_out(
const Scalar& alpha,
Tensor& out) {
#ifdef OP_ARG_CHECK
- ScalarType alpha_type =
- torch::executor::native::utils::get_scalar_dtype(alpha);
- // Check alpha type
- ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out);
-
- // Check Common Dtype
- ET_KERNEL_CHECK(
- ctx,
- (canCast(common_type, out.scalar_type()) &&
- canCast(alpha_type, common_type)),
- InvalidArgument,
- out);
-
// Check Dim Order
ET_KERNEL_CHECK(
ctx,
@@ -72,12 +59,12 @@ Tensor& sub_out(
int inp2_shape[kTensorDimensionLimit];
int out_shape[kTensorDimensionLimit];
- bool broadcast = 0;
+ bool broadcast = false;
int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
max_dim = out.dim() > max_dim ? out.dim() : max_dim;
- bool optimized = 1;
+ bool optimized = true;
for (int i = 0; i < max_dim; i++) {
out_shape[i] = 1;
@@ -103,16 +90,16 @@ Tensor& sub_out(
for (int i = 0; i < out.dim(); i++) {
if (((inp1_shape[i]) != (out_shape[i])) ||
((inp2_shape[i]) != (out_shape[i]))) {
- broadcast = 1;
+ broadcast = true;
}
}
- if (((broadcast == 1) && (max_dim > kTensorDimensionLimit)) ||
+ if (((broadcast) && (max_dim > kTensorDimensionLimit)) ||
(!(((a.scalar_type() == ScalarType::Int) ||
(a.scalar_type() == ScalarType::Float)) &&
(a.scalar_type() == b.scalar_type()) &&
(a.scalar_type() == out.scalar_type())))) {
- optimized = 0;
+ optimized = false;
}
if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
@@ -207,6 +194,19 @@ Tensor& sub_out(
ScalarType compute_type =
torch::executor::native::utils::get_compute_type(common_type);
+ ScalarType alpha_type =
+ torch::executor::native::utils::get_scalar_dtype(alpha);
+ // Check alpha type
+ ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out);
+
+ // Check Common Dtype
+ ET_KERNEL_CHECK(
+ ctx,
+ (canCast(common_type, out.scalar_type()) &&
+ canCast(alpha_type, common_type)),
+ InvalidArgument,
+ out);
+
ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
const CTYPE_COMPUTE val_alpha =
torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>(alpha);
@@ -236,18 +236,6 @@ Tensor& sub_scalar_out(
const Scalar& alpha,
Tensor& out) {
#ifdef OP_ARG_CHECK
- ScalarType alpha_type =
- torch::executor::native::utils::get_scalar_dtype(alpha);
- // Check alpha type
- ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out);
-
- // Check Common Dtype
- ET_KERNEL_CHECK(
- ctx,
- (common_type == out.scalar_type() && canCast(alpha_type, common_type)),
- InvalidArgument,
- out);
-
// Check Dim Order
ET_KERNEL_CHECK(
ctx,
@@ -266,14 +254,16 @@ Tensor& sub_scalar_out(
// @lint-ignore CLANGTIDY facebook-hte-CArray
static constexpr const char op_name[] = "sub.Scalar_out";
- bool optimized = 1;
- ScalarType b_type = torch::executor::native::utils::get_scalar_dtype(b);
+ bool optimized = true;
if (!(((a.scalar_type() == ScalarType::Int) ||
(a.scalar_type() == ScalarType::Float)) &&
- (a.scalar_type() == b_type) &&
(a.scalar_type() == out.scalar_type()))) {
- optimized = 0;
+ optimized = false;
+ }
+
+ if ((b.isFloatingPoint()) && (a.scalar_type() == ScalarType::Int)) {
+ optimized = false;
}
if ((a.scalar_type() == ScalarType::Int) && (optimized)) {
@@ -322,6 +312,19 @@ Tensor& sub_scalar_out(
// Compute Dtype
ScalarType compute_type =
torch::executor::native::utils::get_compute_type(common_type);
+
+ ScalarType alpha_type =
+ torch::executor::native::utils::get_scalar_dtype(alpha);
+ // Check alpha type
+ ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out);
+
+ // Check Common Dtype
+ ET_KERNEL_CHECK(
+ ctx,
+ (common_type == out.scalar_type() && canCast(alpha_type, common_type)),
+ InvalidArgument,
+ out);
+
ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
const CTYPE_COMPUTE val_b =
torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>(b);
diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp
index ec0e48e379..3a590ea071 100644
--- a/backends/cadence/hifi/operators/op_add.cpp
+++ b/backends/cadence/hifi/operators/op_add.cpp
@@ -16,9 +16,9 @@
#include
#include
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
+using executorch::aten::Scalar;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using executorch::runtime::can_cast;
using executorch::runtime::CppTypeToScalarType;
using executorch::runtime::KernelRuntimeContext;
diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp
index e367d71b79..8ad52753de 100644
--- a/backends/cadence/hifi/operators/op_cat.cpp
+++ b/backends/cadence/hifi/operators/op_cat.cpp
@@ -30,7 +30,7 @@ namespace native {
Tensor& cat_out(
RuntimeContext& ctx,
- exec_aten::ArrayRef tensors,
+ executorch::aten::ArrayRef<Tensor> tensors,
int64_t dim,
Tensor& out) {
if (dim < 0) {
diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp
index d31161a7d5..4fa29c00dd 100644
--- a/backends/cadence/hifi/operators/op_clamp.cpp
+++ b/backends/cadence/hifi/operators/op_clamp.cpp
@@ -51,8 +51,8 @@ namespace native {
Tensor& clamp_tensor_out(
RuntimeContext& ctx,
const Tensor& in,
- const exec_aten::optional& min_opt,
- const exec_aten::optional& max_opt,
+ const executorch::aten::optional<Tensor>& min_opt,
+ const executorch::aten::optional<Tensor>& max_opt,
Tensor& out) {
(void)ctx;
diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp
similarity index 100%
rename from backends/cadence/hifi/operators/dequantize_per_tensor.cpp
rename to backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp
diff --git a/backends/cadence/hifi/operators/op_div.cpp b/backends/cadence/hifi/operators/op_div.cpp
index 05f3db7ec3..816422858b 100644
--- a/backends/cadence/hifi/operators/op_div.cpp
+++ b/backends/cadence/hifi/operators/op_div.cpp
@@ -17,10 +17,10 @@
#include
#include
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
+using executorch::aten::Scalar;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using torch::executor::Error;
namespace cadence {
@@ -165,7 +165,7 @@ Tensor& div_out_mode(
RuntimeContext& ctx,
const Tensor& a,
const Tensor& b,
- exec_aten::optional mode,
+ executorch::aten::optional mode,
Tensor& out) {
ET_KERNEL_CHECK(
ctx,
diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp
index f85d3470e9..592ea3bc1e 100644
--- a/backends/cadence/hifi/operators/op_maximum.cpp
+++ b/backends/cadence/hifi/operators/op_maximum.cpp
@@ -12,9 +12,9 @@
#include
#include
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using executorch::runtime::can_cast;
using executorch::runtime::canCast;
using executorch::runtime::CppTypeToScalarType;
diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp
index 342c982a07..82fa7502de 100644
--- a/backends/cadence/hifi/operators/op_mean.cpp
+++ b/backends/cadence/hifi/operators/op_mean.cpp
@@ -56,7 +56,7 @@ int prepare_data(
return num_axis_dims;
}
-Tensor& mean_dim_out(
+Tensor& mean_out(
RuntimeContext& ctx,
const Tensor& in,
optional<ArrayRef<int64_t>> dim_list,
diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp
index 6f81ad5c3e..b78ee64882 100644
--- a/backends/cadence/hifi/operators/op_minimum.cpp
+++ b/backends/cadence/hifi/operators/op_minimum.cpp
@@ -12,9 +12,9 @@
#include
#include
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using executorch::runtime::can_cast;
using executorch::runtime::canCast;
using executorch::runtime::CppTypeToScalarType;
diff --git a/backends/cadence/hifi/operators/op_mul.cpp b/backends/cadence/hifi/operators/op_mul.cpp
index 396833dd1a..b8c3ab7c02 100644
--- a/backends/cadence/hifi/operators/op_mul.cpp
+++ b/backends/cadence/hifi/operators/op_mul.cpp
@@ -15,10 +15,10 @@
#include
#include
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
+using executorch::aten::Scalar;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using executorch::runtime::can_cast;
using executorch::runtime::CppTypeToScalarType;
using torch::executor::Error;
diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp
similarity index 100%
rename from backends/cadence/hifi/operators/quantize_per_tensor.cpp
rename to backends/cadence/hifi/operators/op_quantize_per_tensor.cpp
diff --git a/backends/cadence/hifi/operators/quantized_layer_norm.cpp b/backends/cadence/hifi/operators/op_quantized_layer_norm.cpp
similarity index 100%
rename from backends/cadence/hifi/operators/quantized_layer_norm.cpp
rename to backends/cadence/hifi/operators/op_quantized_layer_norm.cpp
diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/op_quantized_linear_out.cpp
similarity index 97%
rename from backends/cadence/hifi/operators/quantized_linear_out.cpp
rename to backends/cadence/hifi/operators/op_quantized_linear_out.cpp
index b8e1d117fb..3d9983b40c 100644
--- a/backends/cadence/hifi/operators/quantized_linear_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_linear_out.cpp
@@ -219,7 +219,7 @@ void quantized_linear_out(
int64_t out_zero_point,
__ET_UNUSED const optional<Tensor>& offset,
Tensor& out) {
- if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+ if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
_quantized_linear_asym8u(
in,
weight,
@@ -231,7 +231,7 @@ void quantized_linear_out(
out_zero_point,
offset,
out);
- } else if (out.scalar_type() == exec_aten::ScalarType::Char) {
+ } else if (out.scalar_type() == executorch::aten::ScalarType::Char) {
_quantized_linear_asym8s(
in,
weight,
@@ -261,7 +261,7 @@ void quantized_linear_per_tensor_out(
int64_t out_zero_point,
__ET_UNUSED const optional& offset,
Tensor& out) {
- if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+ if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
_quantized_linear_per_tensor_asym8u(
in,
weight,
@@ -273,7 +273,7 @@ void quantized_linear_per_tensor_out(
out_zero_point,
offset,
out);
- } else if (out.scalar_type() == exec_aten::ScalarType::Char) {
+ } else if (out.scalar_type() == executorch::aten::ScalarType::Char) {
_quantized_linear_per_tensor_asym8s(
in,
weight,
diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp
similarity index 98%
rename from backends/cadence/hifi/operators/quantized_relu_out.cpp
rename to backends/cadence/hifi/operators/op_quantized_relu_out.cpp
index d78e555ad1..0860109f7c 100644
--- a/backends/cadence/hifi/operators/quantized_relu_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp
@@ -45,7 +45,7 @@ void quantized_relu_(
}
}
-void quantized_relu_out(
+void quantized_relu_per_tensor_out(
KernelRuntimeContext& ctx,
const Tensor& input,
const Tensor& in_zero_point,
@@ -100,4 +100,4 @@ void quantized_relu_out(
} // namespace native
} // namespace HiFi
} // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/op_remainder.cpp b/backends/cadence/hifi/operators/op_remainder.cpp
index d8c4a6d2d8..99cd6ad544 100644
--- a/backends/cadence/hifi/operators/op_remainder.cpp
+++ b/backends/cadence/hifi/operators/op_remainder.cpp
@@ -8,6 +8,7 @@
#include
+#include
#include
#include
#include
@@ -15,8 +16,6 @@
#include
#include
-#include "kernels.h"
-
using executorch::aten::RuntimeContext;
using executorch::aten::Scalar;
using executorch::aten::ScalarType;
diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp
index 1cf717988a..885c26723a 100644
--- a/backends/cadence/hifi/operators/op_rsqrt.cpp
+++ b/backends/cadence/hifi/operators/op_rsqrt.cpp
@@ -11,9 +11,9 @@
#include
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
namespace cadence {
namespace impl {
diff --git a/backends/cadence/hifi/operators/op_sigmoid.cpp b/backends/cadence/hifi/operators/op_sigmoid.cpp
index 35321cc27e..872d9255bd 100644
--- a/backends/cadence/hifi/operators/op_sigmoid.cpp
+++ b/backends/cadence/hifi/operators/op_sigmoid.cpp
@@ -14,9 +14,9 @@
#include
#include
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using torch::executor::Error;
namespace cadence {
@@ -24,7 +24,7 @@ namespace impl {
namespace HiFi {
namespace native {
-using Tensor = exec_aten::Tensor;
+using Tensor = executorch::aten::Tensor;
Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
(void)ctx;
diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp
index e026afd2c9..2ef233c9ff 100644
--- a/backends/cadence/hifi/operators/op_softmax.cpp
+++ b/backends/cadence/hifi/operators/op_softmax.cpp
@@ -8,11 +8,11 @@
#include
+#include
#include
#include
#include
#include
-#include "kernels.h"
using executorch::aten::ScalarType;
using executorch::aten::Tensor;
@@ -24,7 +24,7 @@ namespace impl {
namespace HiFi {
namespace native {
-Tensor& softmax_out(
+Tensor& _softmax_out(
KernelRuntimeContext& ctx,
const Tensor& in,
int64_t dim,
@@ -50,7 +50,7 @@ Tensor& softmax_out(
// Adjust for negative dim
dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim;
- const exec_aten::optional& dim_t = dim;
+ const executorch::aten::optional& dim_t = dim;
const size_t d = ET_NORMALIZE_IX(dim_t.value(), in.dim());
const size_t size = in.size(d);
diff --git a/backends/cadence/hifi/operators/op_sub.cpp b/backends/cadence/hifi/operators/op_sub.cpp
index cf10e41435..02c8c60eac 100644
--- a/backends/cadence/hifi/operators/op_sub.cpp
+++ b/backends/cadence/hifi/operators/op_sub.cpp
@@ -16,10 +16,10 @@
#include
#include
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
+using executorch::aten::Scalar;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using executorch::runtime::can_cast;
using executorch::runtime::CppTypeToScalarType;
using torch::executor::Error;
diff --git a/backends/cadence/hifi/operators/op_tanh.cpp b/backends/cadence/hifi/operators/op_tanh.cpp
index 13578beb88..3fdd3111ef 100644
--- a/backends/cadence/hifi/operators/op_tanh.cpp
+++ b/backends/cadence/hifi/operators/op_tanh.cpp
@@ -11,9 +11,9 @@
#include
#include
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using torch::executor::Error;
namespace cadence {
diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl
index 6c671a5f24..1c2b481410 100644
--- a/backends/cadence/hifi/operators/targets.bzl
+++ b/backends/cadence/hifi/operators/targets.bzl
@@ -1,243 +1,70 @@
load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX")
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
-def define_common_targets():
- """Defines targets that should be shared between fbcode and xplat.
-
- The directory containing this targets.bzl file should also contain both
- TARGETS and BUCK files that call this function.
- """
-
- # Define build targets for all operators registered in the tables above.
- runtime.cxx_library(
- name = "quantize_per_tensor",
- srcs = [
- "quantize_per_tensor.cpp"
- ],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/kernels/portable/cpu:scalar_utils",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
- )
+def define_operator(name: str, deps: list[str] | None = None) -> None:
+ op_name = "op_{}".format(name)
- runtime.cxx_library(
- name = "dequantize_per_tensor",
- srcs = [
- "dequantize_per_tensor.cpp"
- ],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/kernels/portable/cpu:scalar_utils",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
- )
+ # Deps used by all operators.
+ common_deps = [
+ "//executorch/kernels/portable/cpu/util:all_deps",
+ "//executorch/kernels/portable/cpu/pattern:all_deps",
+ "//executorch/runtime/kernel:kernel_includes",
+ "//executorch/kernels/portable/cpu:scalar_utils",
+ "//executorch/backends/cadence/hifi/kernels:kernels",
+ "//executorch/kernels/portable/cpu/util:dtype_util",
+ "//executorch/kernels/portable/cpu/util:elementwise_util",
+ "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
+ ]
+ if deps == None:
+ deps = []
runtime.cxx_library(
- name = "quantized_layer_norm",
- srcs = [
- "quantized_layer_norm.cpp"
- ],
- exported_headers = ["operators.h"],
+ name = op_name,
+ srcs = [op_name + ".cpp"],
platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/kernels/portable/cpu:scalar_utils",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
visibility = [
"//executorch/backends/cadence/...",
"@EXECUTORCH_CLIENTS",
],
- )
-
- runtime.cxx_library(
- name = "quantized_linear_out",
- srcs = [
- "quantized_linear_out.cpp"
- ],
+ deps = deps + common_deps,
exported_headers = ["operators.h"],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/kernels/portable/cpu:scalar_utils",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
- )
-
- runtime.cxx_library(
- name = "op_add",
- srcs = [
- "op_add.cpp",
- ],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/kernels/portable/cpu:scalar_utils",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions",
- "//executorch/kernels/portable/cpu/util:dtype_util",
- "//executorch/kernels/portable/cpu/util:elementwise_util",
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
- )
-
-
- runtime.cxx_library(
- name = "op_mul",
- srcs = [
- "op_mul.cpp",
- ],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/kernels/portable/cpu:scalar_utils",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/kernels/portable/cpu/util:dtype_util",
- "//executorch/kernels/portable/cpu/util:elementwise_util",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
- )
-
- runtime.cxx_library(
- name = "op_sub",
- srcs = [
- "op_sub.cpp",
- ],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/kernels/portable/cpu:scalar_utils",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/kernels/portable/cpu/util:dtype_util",
- "//executorch/kernels/portable/cpu/util:elementwise_util",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
)
- runtime.cxx_library(
- name = "op_div",
- srcs = [
- "op_div.cpp",
- ],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/kernels/portable/cpu:scalar_utils",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/kernels/portable/cpu/util:dtype_util",
- "//executorch/kernels/portable/cpu/util:elementwise_util",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
- )
+OPERATORS = [
+ "add",
+ "atan2",
+ "cat",
+ "clamp",
+ "dequantize_per_tensor",
+ "div",
+ "full",
+ "maximum",
+ "mean",
+ "minimum",
+ "mul",
+ "permute_copy",
+ "pow",
+ "quantize_per_tensor",
+ "quantized_layer_norm",
+ "quantized_linear_out",
+ "quantized_relu_out",
+ "remainder",
+ "rsqrt",
+ "sigmoid",
+ "softmax",
+ "sub",
+ "tanh",
+ "where"
+]
- runtime.cxx_library(
- name = "op_sigmoid",
- srcs = [
- "op_sigmoid.cpp",
- ],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/kernels/portable/cpu/util:dtype_util",
- "//executorch/kernels/portable/cpu/util:elementwise_util",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
- )
+def define_common_targets():
+ """Defines targets that should be shared between fbcode and xplat.
- runtime.cxx_library(
- name = "op_tanh",
- srcs = [
- "op_tanh.cpp",
- ],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
- )
+ The directory containing this targets.bzl file should also contain both
+ TARGETS and BUCK files that call this function.
+ """
-
- runtime.cxx_library(
- name = "op_where",
- srcs = [
- "op_where.cpp",
- ],
- platforms = CXX,
- deps = [
- "//executorch/kernels/portable/cpu/util:all_deps",
- "//executorch/kernels/portable/cpu/pattern:all_deps",
- "//executorch/runtime/kernel:kernel_includes",
- "//executorch/backends/cadence/hifi/kernels:kernels",
- "//executorch/kernels/portable/cpu/util:elementwise_util",
- "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
- ],
- visibility = [
- "//executorch/backends/cadence/...",
- "@EXECUTORCH_CLIENTS",
- ],
- )
+ # Define build targets for all operators registered in the tables above.
+ for op in OPERATORS:
+ define_operator(op)
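
The refactor above replaces a couple hundred lines of per-operator boilerplate with a single `define_operator` helper driven by the `OPERATORS` list. A minimal plain-Python sketch of the expansion it performs is shown below; `expand_operator` and the `__main__` check are hypothetical and only mirror the keyword arguments the Starlark `runtime.cxx_library` call receives.

```python
# Sketch (illustrative only, not part of the build files) of what
# define_operator("add") expands to, mirrored in plain Python data structures.

COMMON_DEPS = [
    "//executorch/kernels/portable/cpu/util:all_deps",
    "//executorch/kernels/portable/cpu/pattern:all_deps",
    "//executorch/runtime/kernel:kernel_includes",
    "//executorch/kernels/portable/cpu:scalar_utils",
    "//executorch/backends/cadence/hifi/kernels:kernels",
    "//executorch/kernels/portable/cpu/util:dtype_util",
    "//executorch/kernels/portable/cpu/util:elementwise_util",
    "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions",
]

def expand_operator(name, deps=None):
    """Return the keyword arguments the cxx_library rule would receive."""
    deps = deps or []
    return {
        "name": "op_" + name,
        "srcs": ["op_" + name + ".cpp"],
        "exported_headers": ["operators.h"],
        "deps": deps + COMMON_DEPS,
        "visibility": ["//executorch/backends/cadence/...", "@EXECUTORCH_CLIENTS"],
    }

if __name__ == "__main__":
    target = expand_operator("add")
    assert target["name"] == "op_add" and target["srcs"] == ["op_add.cpp"]
```

Per-operator extra dependencies can still be threaded through the optional `deps` argument, which is prepended to the common dependency list.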
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c
index 50d24c8bae..7d95e536c9 100644
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c
@@ -843,4 +843,3 @@ WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
}
#endif
-
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c
new file mode 100644
index 0000000000..b069035dc9
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c
@@ -0,0 +1,232 @@
+/*******************************************************************************
+* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files (the
+* "Software"), to use this Software with Cadence processor cores only and
+* not with any other processors and platforms, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+******************************************************************************/
+#include "xa_nnlib_common.h"
+
+#include
+
+/*
+ * Currently supports only up to 5D input tensors.
+ * 1/2/3/4 D input tensors will be scaled up to 5D.
+ * For example, 2x3 -> 1x1x1x2x3.
+ */
+
+WORD32 xa_nn_transpose_8_8(WORD8 * __restrict__ p_out
+ ,const WORD32 *const p_out_shape
+ ,const WORD8 * __restrict__ p_inp
+ ,const WORD32 *const p_inp_shape
+ ,const WORD32 * __restrict__ p_permute_vec
+ ,WORD32 num_out_dims
+ ,WORD32 num_inp_dims)
+{
+ /* NULL pointer checks */
+ XA_NNLIB_ARG_CHK_PTR(p_out, -1);
+ XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
+ XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1);
+ XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
+ XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1);
+
+ /* Invalid input checks */
+ XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1);
+ XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1);
+
+ int itr = 0;
+ for(itr=0; itr < num_inp_dims; itr++)
+ {
+ XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1);
+ }
+ for(itr=0; itr < num_out_dims; itr++)
+ {
+ XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1);
+ }
+
+ /* Output shape provided must be correct based on input
+ * shape and permute values */
+ for(itr=0; itr < num_out_dims; itr++)
+ {
+ int output_dim = p_out_shape[itr];
+ int expected_dim = p_inp_shape[p_permute_vec[itr]];
+ XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1);
+ }
+
+ /* Pointer alignment checks */
+ XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), -1);
+ XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), -1);
+ XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1);
+ XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
+ XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1);
+
+ /* Shift all dim with 1 in the outer part */
+ int eff_output_shape[5];
+ int eff_permute_vec[5];
+
+ for(int i = 0; i < num_out_dims; i++)
+ {
+ eff_output_shape[i] = p_out_shape[i];
+ eff_permute_vec[i] = p_permute_vec[i];
+ }
+
+ int one_i=num_out_dims-1, non_one_i=num_out_dims-1;
+ while(one_i > 0 && non_one_i >=0){
+ while(one_i > 0 && eff_output_shape[one_i]!=1){
+ one_i--;
+ }
+ non_one_i = one_i;
+ while(non_one_i >= 0 && eff_output_shape[non_one_i]==1)
+ {
+ non_one_i--;
+ }
+ if(one_i > 0 && non_one_i >=0){
+ int temp;
+ /*swap output_shape*/
+ {
+ temp = eff_output_shape[one_i];
+ eff_output_shape[one_i] = eff_output_shape[non_one_i];
+ eff_output_shape[non_one_i] = temp;
+ }
+ /*swap permute_vec*/
+ {
+ temp = eff_permute_vec[one_i];
+ eff_permute_vec[one_i] = eff_permute_vec[non_one_i];
+ eff_permute_vec[non_one_i] = temp;
+ }
+
+ }
+ }
+
+
+ /* Promoting lesser dim tensors to 5D tensors.
+ * Also updating the permute_vec and shapes as needed for optimization */
+ int p_5D_inp_shape[5] = {1, 1, 1, 1, 1};
+ int p_5D_out_shape[5] = {1, 1, 1, 1, 1};
+ int p_5D_permute_vec[5] = {0, 1, 2, 3, 4};
+
+ /* Check if any inner inp dimension is same in the output */
+ int last_dim_same = 1, last_n_same_dim = 0;
+ itr = num_inp_dims - 1;
+ while(itr >= 0)
+ {
+ last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim;
+ last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0;
+ itr--;
+ }
+
+ int dims_added = 5 - num_inp_dims;
+ itr = num_inp_dims - 1;
+ int same_count = last_n_same_dim;
+ int count = 4;
+ while(itr >= 0)
+ {
+ p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr];
+ p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr];
+ same_count--;
+ itr--;
+ count = (same_count > 0) ? count : count - 1;
+ }
+
+ itr = num_inp_dims - 1;
+ same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0;
+ count = 4;
+ while(itr >= 0)
+ {
+ p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added;
+ same_count--;
+ itr--;
+ count--;
+ }
+
+ int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4;
+ int inp_dim1, inp_dim2, inp_dim3, inp_dim4;
+ int inp_stride[5];
+
+ out_dim0 = p_5D_out_shape[0];
+ out_dim1 = p_5D_out_shape[1];
+ out_dim2 = p_5D_out_shape[2];
+ out_dim3 = p_5D_out_shape[3];
+ out_dim4 = p_5D_out_shape[4];
+
+ inp_dim1 = p_5D_inp_shape[1];
+ inp_dim2 = p_5D_inp_shape[2];
+ inp_dim3 = p_5D_inp_shape[3];
+ inp_dim4 = p_5D_inp_shape[4];
+
+ inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4;
+ inp_stride[1] = inp_dim2*inp_dim3*inp_dim4;
+ inp_stride[2] = inp_dim3*inp_dim4;
+ inp_stride[3] = inp_dim4;
+ inp_stride[4] = 1;
+
+ if(last_n_same_dim)
+ {
+ int itr0, itr1, itr2, itr3;
+ WORD8 *p_inp0 = (WORD8*)p_inp;
+ for(itr0 = 0; itr0 < out_dim0; itr0++)
+ {
+ WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
+#pragma loop_count min=1
+ for(itr1 = 0; itr1 < out_dim1; itr1++)
+ {
+ WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
+#pragma loop_count min=1
+ for(itr2 = 0; itr2 < out_dim2; itr2++)
+ {
+ WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
+#pragma loop_count min=1
+ for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
+ {
+ WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
+ memcpy(p_out, p_inp4, out_dim4);
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ int itr0, itr1, itr2, itr3, itr4;
+ WORD8 *p_inp0 = (WORD8*)p_inp;
+ for(itr0 = 0; itr0 < out_dim0; itr0++)
+ {
+ WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
+ for(itr1 = 0; itr1 < out_dim1; itr1++)
+ {
+ WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
+ for(itr2 = 0; itr2 < out_dim2; itr2++)
+ {
+ WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
+ for(itr3 = 0; itr3 < out_dim3; itr3++)
+ {
+ WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
+ for(itr4 = 0; itr4 < out_dim4; itr4++)
+ {
+ WORD8 d0 = *(p_inp4);
+ p_inp4 += inp_stride[p_5D_permute_vec[4]];
+ *p_out++ = d0;
+
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return 0;
+}
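
The kernel above promotes 1-4D inputs to 5D and shifts the permute vector accordingly before walking the data. A small Python sketch of that promotion follows, using NumPy purely for verification; the real kernel additionally collapses trailing dimensions that keep their order so it can `memcpy` contiguous runs.

```python
# Minimal sketch (illustrative only) of the shape/permute promotion described
# in the comment at the top of xa_nn_transpose_8_8: tensors with fewer than 5
# dims are left-padded with 1s, and the original axes shift right accordingly.

import numpy as np

def promote_to_5d(shape, permute_vec):
    dims_added = 5 - len(shape)
    shape_5d = [1] * dims_added + list(shape)
    # Leading padded dims stay in place; original axes shift right by dims_added.
    permute_5d = list(range(dims_added)) + [p + dims_added for p in permute_vec]
    return shape_5d, permute_5d

inp = np.arange(6).reshape(2, 3)                      # 2x3 input
shape_5d, perm_5d = promote_to_5d(inp.shape, [1, 0])  # transpose the two dims
out = np.transpose(inp.reshape(shape_5d), perm_5d)
assert shape_5d == [1, 1, 1, 2, 3]
assert out.reshape(3, 2).tolist() == inp.T.tolist()
```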
diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
index 3c66796594..bc0f51a236 100644
--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
@@ -1,4 +1,5 @@
# Copyright (c) Qualcomm Innovation Center, Inc.
+# Copyright 2025 Arm Limited and/or its affiliates.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
@@ -199,11 +200,6 @@ target_link_libraries(
#
target_link_options_shared_lib(qnn_executorch_backend)
-#
-# add compile option
-#
-target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED)
-
#
# add sources
#
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
new file mode 100644
index 0000000000..de4b7ce2cc
--- /dev/null
+++ b/backends/qualcomm/_passes/__init__.py
@@ -0,0 +1,34 @@
+from .annotate_and_quant_scalar import AnnotateAndQuantScalar
+from .annotate_decomposed import AnnotateDecomposed
+from .annotate_quant_attrs import AnnotateQuantAttrs
+from .convert_bmm_to_matmul import ConvertBmmToMatmul
+from .convert_interpolate_with_upsample2d import ConvertInterpolateWithUpsample2D
+from .convert_prelu import ConvertPReLU
+from .convert_to_linear import ConvertToLinear
+from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
+from .fold_qdq import FoldQDQ
+from .i64_to_i32 import I64toI32
+from .layout_transform import LayoutTransform
+from .recompose_pixel_unshuffle import RecomposePixelUnshuffle
+from .recompose_rms_norm import RecomposeRmsNorm
+from .remove_redundancy import RemoveRedundancy
+from .replace_index_put_input import ReplaceIndexPutInput
+
+
+__all__ = [
+ AnnotateAndQuantScalar,
+ AnnotateDecomposed,
+ AnnotateQuantAttrs,
+ ConvertBmmToMatmul,
+ ConvertInterpolateWithUpsample2D,
+ ConvertPReLU,
+ ConvertToLinear,
+ ExpandBroadcastTensorShape,
+ FoldQDQ,
+ I64toI32,
+ LayoutTransform,
+ RecomposePixelUnshuffle,
+ RecomposeRmsNorm,
+ RemoveRedundancy,
+ ReplaceIndexPutInput,
+]
diff --git a/backends/qualcomm/_passes/annotate_and_quant_scalar.py b/backends/qualcomm/_passes/annotate_and_quant_scalar.py
index 1db50694ec..86475c39b1 100644
--- a/backends/qualcomm/_passes/annotate_and_quant_scalar.py
+++ b/backends/qualcomm/_passes/annotate_and_quant_scalar.py
@@ -53,7 +53,9 @@ def _get_source_scalar_node(self, node: torch.fx.Node) -> torch.fx.Node:
if node.op == "placeholder":
if not (shape := node.meta["val"].size()):
return node
- assert f"The output of node {node} is not a scalar, but a tensor with shape {shape}"
+ assert (
+ not shape
+ ), f"The output of node {node} is not a scalar, but a tensor with shape {shape}"
return self._get_source_scalar_node(node.args[0])
def _update_scalar_node_attrs(self, node: torch.fx.Node, quant_attrs: Dict) -> Dict:
diff --git a/backends/qualcomm/_passes/fuse_consecutive_transpose.py b/backends/qualcomm/_passes/fuse_consecutive_transpose.py
index c81818e00e..16ce380307 100644
--- a/backends/qualcomm/_passes/fuse_consecutive_transpose.py
+++ b/backends/qualcomm/_passes/fuse_consecutive_transpose.py
@@ -15,8 +15,18 @@
class FuseConsecutiveTranspose(ExportPass):
"""
- This pass fuses consecutive transpose / permute into one to reduce runtime
- overhead
+ This pass fuses consecutive transpose / permute nodes into a single permute, or
+ removes them entirely when they cancel out, to reduce runtime overhead.
+ To simplify the fuse logic, we first clone transposes so that each permute node's
+ output feeds at most one permute node.
+ Example:
+ Before clone transpose:
+ relu -> permute1 ─> permute2
+ |──────> permute3
+
+ After clone transpose:
+ relu ─> permute1 ──────> permute2
+ |───> permute4(new) ─> permute3
"""
def __init__(self):
@@ -27,6 +37,30 @@ def __init__(self):
self.visited = set()
self.nodes = []
+ def _clone_transpose(
+ self, graph_module: torch.fx.GraphModule
+ ) -> torch.fx.GraphModule:
+ graph = graph_module.graph
+ for n in graph_module.graph.nodes:
+ if n.target in self.op_map:
+ users = [user for user in list(n.users) if user.target in self.op_map]
+ if len(users) > 1:
+ for i in range(1, len(users)):
+ with graph.inserting_after(n):
+ clone_permute_node = graph.create_node(
+ "call_function",
+ exir_ops.edge.aten.permute_copy.default,
+ (n.args[0], n.args[1]),
+ )
+ clone_permute_node.meta = n.meta
+ users[i].replace_input_with(n, clone_permute_node)
+
+ def _is_dispensable(self, axis_order):
+ for index, value in enumerate(axis_order):
+ if index != value:
+ return False
+ return True
+
def _traverse(self, node):
if node in self.visited or node.target not in self.op_map:
return
@@ -34,47 +68,50 @@ def _traverse(self, node):
self.nodes.append(node)
self.visited.add(node)
next_users = [n for n in list(node.users) if n.target in self.op_map]
+
+ assert (
+ len(next_users) <= 1
+ ), "Each permute node should have at most 1 permute output node after _clone_transpose"
if not next_users:
return
-
- if len(next_users) == 1:
- self._traverse(list(node.users)[0])
else:
- raise NotImplementedError(
- f"Check the node {node}, wich encounter mutilple permute output case"
- )
+ self._traverse(list(node.users)[0])
def _fuse(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
graph = graph_module.graph
for n in graph_module.graph.nodes:
self._traverse(n)
if len(self.nodes) > 1:
- permute_order = []
input_node, output_node = self.nodes[0].args[0], self.nodes[-1]
input_shape = input_node.meta["val"].shape
axis_order = torch.arange(len(input_shape)).tolist()
for node in self.nodes:
- permute_order.append(node.args[1])
axis_order = [axis_order[i] for i in node.args[1]]
- with graph.inserting_after(input_node):
- permute_op = exir_ops.edge.aten.permute_copy.default
- permute_node = graph.create_node(
- "call_function", permute_op, (input_node, axis_order)
- )
- users = output_node.users.copy()
- for user in users:
- user.replace_input_with(output_node, permute_node)
-
- # copy metadata
- permute_node.meta = output_node.meta
- # Without "qnn_permute", we might obtain wrong input shape
- if [pn.meta.get(QCOM_INSERTED_PERMUTE) for pn in self.nodes]:
- permute_node.meta[QCOM_INSERTED_PERMUTE] = True
+ # If axis order is just [0,1,2,3], we ignore permute node
+ if self._is_dispensable(axis_order):
+ for user in output_node.users.copy():
+ user.replace_input_with(output_node, n.args[0])
+ else:
+ with graph.inserting_after(input_node):
+ permute_op = exir_ops.edge.aten.permute_copy.default
+ permute_node = graph.create_node(
+ "call_function", permute_op, (input_node, axis_order)
+ )
+ users = output_node.users.copy()
+ for user in users:
+ user.replace_input_with(output_node, permute_node)
+
+ # copy metadata
+ permute_node.meta = output_node.meta
+ # Without "qnn_permute", we might obtain wrong input shape
+ if [pn.meta.get(QCOM_INSERTED_PERMUTE) for pn in self.nodes]:
+ permute_node.meta[QCOM_INSERTED_PERMUTE] = True
# clear current stack
self.nodes = []
def call(self, graph_module: torch.fx.GraphModule):
+ self._clone_transpose(graph_module)
self._fuse(graph_module)
graph_module.recompile()
dead_code_elimination_pass(graph_module)
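
The updated pass composes the chained permute orders into one axis order and now drops the fused node when that order is the identity. A standalone sketch of that composition logic, with `fuse_permutes` and `is_dispensable` as illustrative stand-ins for the loop in `_fuse` and the new `_is_dispensable` helper:

```python
# Sketch (standalone illustration, not the pass itself) of folding a chain of
# permute orders into a single axis order and detecting when it is an identity
# permutation that can be removed entirely.

def fuse_permutes(rank, permute_orders):
    axis_order = list(range(rank))
    for order in permute_orders:
        axis_order = [axis_order[i] for i in order]
    return axis_order

def is_dispensable(axis_order):
    return all(i == v for i, v in enumerate(axis_order))

# permute(0,2,3,1) followed by permute(0,3,1,2) round-trips back to NCHW,
# so the fused node can be dropped.
fused = fuse_permutes(4, [[0, 2, 3, 1], [0, 3, 1, 2]])
assert fused == [0, 1, 2, 3] and is_dispensable(fused)

# A single non-trivial permute stays as one fused permute_copy node.
assert fuse_permutes(4, [[0, 2, 3, 1]]) == [0, 2, 3, 1]
```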
diff --git a/backends/qualcomm/_passes/i64_to_i32.py b/backends/qualcomm/_passes/i64_to_i32.py
index 1d2171cc37..29c747d1a1 100644
--- a/backends/qualcomm/_passes/i64_to_i32.py
+++ b/backends/qualcomm/_passes/i64_to_i32.py
@@ -3,6 +3,8 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+from typing import FrozenSet
+
import torch
from executorch.backends.qualcomm.builders.utils import get_parameter, is_constant
from executorch.exir.dialects._ops import ops as exir_ops
@@ -15,9 +17,14 @@ class I64toI32(ExportPass):
Cast unsupported int64 datatype into int32.
"""
- def __init__(self, edge_program: torch.export.ExportedProgram):
+ def __init__(
+ self,
+ edge_program: torch.export.ExportedProgram,
+ skip_node: FrozenSet[str] = frozenset(),
+ ):
super(I64toI32, self).__init__()
self.edge_program = edge_program
+ self.skip_node = skip_node
# pyre-ignore[4]
self.copy_op = exir_ops.edge.aten._to_copy.default
@@ -42,6 +49,8 @@ def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool:
def _cast_to_int32(self, graph_module: torch.fx.GraphModule):
for n in graph_module.graph.nodes:
+ if n.target in self.skip_node:
+ continue
if is_constant(n, self.edge_program):
param = get_parameter(n, self.edge_program)
if param.dtype == torch.int64:
diff --git a/backends/qualcomm/_passes/insert_requantize.py b/backends/qualcomm/_passes/insert_requantize.py
index 11aad02a0c..83b729f3c4 100644
--- a/backends/qualcomm/_passes/insert_requantize.py
+++ b/backends/qualcomm/_passes/insert_requantize.py
@@ -89,15 +89,9 @@ def _single_output_annotation(
requantize_dict = n.meta.pop(QCOM_REQUANTIZE)
# {quant_attr: user_node_name_list}
group_quant_attr_dict = self._invert_dict(requantize_dict)
- # TODO: If users of the node contain output node,
- # we replace the node with to_copy op. However, it would
- # be problem when the node has multiple to_copy ops
- add_output = len(group_quant_attr_dict) == 1
for hashable_quant_attr, user_nodes in group_quant_attr_dict.items():
user_nodes_copy = user_nodes.copy()
- if add_output:
- user_nodes_copy.append("output")
self._insert_to_copy(gm, n, dict(hashable_quant_attr), user_nodes_copy)
def _insert(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
diff --git a/backends/qualcomm/_passes/layout_transform.py b/backends/qualcomm/_passes/layout_transform.py
index 098910ed86..ccc34d3a52 100644
--- a/backends/qualcomm/_passes/layout_transform.py
+++ b/backends/qualcomm/_passes/layout_transform.py
@@ -30,6 +30,7 @@ class LayoutTransform(ExportPass):
"""
layout_sensitive_ops = {
+ exir_ops.edge.aten.adaptive_avg_pool2d.default,
exir_ops.edge.aten.avg_pool2d.default,
exir_ops.edge.aten.convolution.default,
exir_ops.edge.aten.max_pool2d_with_indices.default,
diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py
index ac6525ae76..a606a21c62 100755
--- a/backends/qualcomm/_passes/utils.py
+++ b/backends/qualcomm/_passes/utils.py
@@ -43,3 +43,63 @@ def get_quant_attrs(
quant_attrs[QCOM_ENCODING] = quant_node.target
return quant_attrs
+
+
+def get_passes_dependency_for_capture_program():
+ """
+ This function records the dependencies for passes used in the capture_program.
+
+ It returns a dictionary where the keys are pass classes and the values are lists of
+ dependencies required by each pass. This helps in managing and organizing the sequence
+ of passes needed for the capture_program to function correctly.
+
+ Returns:
+ dict: A dictionary mapping each pass to its corresponding list of dependencies.
+ """
+ from executorch.backends.qualcomm._passes import (
+ AnnotateAndQuantScalar,
+ AnnotateDecomposed,
+ AnnotateQuantAttrs,
+ ConvertBmmToMatmul,
+ ConvertInterpolateWithUpsample2D,
+ ConvertPReLU,
+ ConvertToLinear,
+ ExpandBroadcastTensorShape,
+ FoldQDQ,
+ I64toI32,
+ LayoutTransform,
+ RecomposePixelUnshuffle,
+ RecomposeRmsNorm,
+ RemoveRedundancy,
+ ReplaceIndexPutInput,
+ )
+
+ return {
+ RecomposePixelUnshuffle: [RemoveRedundancy],
+ RecomposeRmsNorm: [RemoveRedundancy],
+ ConvertToLinear: [RecomposePixelUnshuffle],
+ ConvertPReLU: [RemoveRedundancy],
+ ConvertBmmToMatmul: [ConvertToLinear],
+ ConvertInterpolateWithUpsample2D: [RemoveRedundancy],
+ I64toI32: [RemoveRedundancy],
+ AnnotateQuantAttrs: [
+ RecomposePixelUnshuffle,
+ RecomposeRmsNorm,
+ ConvertToLinear,
+ ConvertPReLU,
+ ConvertBmmToMatmul,
+ ConvertInterpolateWithUpsample2D,
+ ],
+ AnnotateAndQuantScalar: [
+ AnnotateQuantAttrs,
+ ],
+ AnnotateDecomposed: [RemoveRedundancy],
+ FoldQDQ: [AnnotateQuantAttrs, AnnotateAndQuantScalar, AnnotateDecomposed],
+ ExpandBroadcastTensorShape: [RemoveRedundancy],
+ LayoutTransform: [
+ AnnotateQuantAttrs,
+ AnnotateAndQuantScalar,
+ ExpandBroadcastTensorShape,
+ ],
+ ReplaceIndexPutInput: [LayoutTransform],
+ }
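
The dependency table above is consumed by `_topological_sort_passes` in `backends/qualcomm/utils/utils.py` (changed later in this diff) via `torch.fx`'s `PassManager` and `this_before_that_pass_constraint`. A simplified, string-keyed sketch of the same idea, assuming a plain depth-first topological sort in place of the PassManager constraint solver:

```python
# Sketch (simplified, string-keyed stand-in for the real pass classes) of how
# the dependency table is used: each entry maps a pass to the passes that must
# run before it, so ordering reduces to a topological sort.

def topo_sort(passes, dep_table):
    ordered, seen = [], set()

    def visit(p):
        if p in seen:
            return
        seen.add(p)
        for dep in dep_table.get(p, []):
            if dep in passes:
                visit(dep)
        ordered.append(p)

    for p in passes:
        visit(p)
    return ordered

dep_table = {
    "ConvertToLinear": ["RecomposePixelUnshuffle"],
    "RecomposePixelUnshuffle": ["RemoveRedundancy"],
    "AnnotateQuantAttrs": ["ConvertToLinear"],
    "FoldQDQ": ["AnnotateQuantAttrs"],
}
passes = ["FoldQDQ", "ConvertToLinear", "RemoveRedundancy",
          "RecomposePixelUnshuffle", "AnnotateQuantAttrs"]
order = topo_sort(passes, dep_table)
assert order.index("RemoveRedundancy") < order.index("RecomposePixelUnshuffle")
assert order.index("AnnotateQuantAttrs") < order.index("FoldQDQ")
```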
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py
index 61ed30679e..7a4d6d764b 100644
--- a/backends/qualcomm/builders/__init__.py
+++ b/backends/qualcomm/builders/__init__.py
@@ -7,6 +7,7 @@
from . import (
node_visitor,
op_abs,
+ op_adaptive_avg_pool2d,
op_add,
op_arange,
op_avg_pool2d,
@@ -78,6 +79,7 @@
__all__ = [
node_visitor,
op_abs,
+ op_adaptive_avg_pool2d,
op_add,
op_arange,
op_avg_pool2d,
diff --git a/backends/qualcomm/builders/op_adaptive_avg_pool2d.py b/backends/qualcomm/builders/op_adaptive_avg_pool2d.py
new file mode 100644
index 0000000000..c944e1646e
--- /dev/null
+++ b/backends/qualcomm/builders/op_adaptive_avg_pool2d.py
@@ -0,0 +1,125 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import warnings
+from typing import Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+import numpy as np
+
+import torch
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpPoolAvg2d, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class AdaptiveAvgPool2D(NodeVisitor):
+ target = ["aten.adaptive_avg_pool2d.default"]
+
+ def __init__(self, *args) -> None:
+ super().__init__(*args)
+
+ def define_node(
+ self,
+ node: torch.fx.Node,
+ nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+ ) -> PyQnnWrapper.PyQnnOpWrapper:
+
+ input_node = node.args[0]
+ input_tensor = self.get_tensor(input_node, node)
+ input_tensor_wrapper = self.define_tensor(
+ input_node,
+ node,
+ input_tensor,
+ PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+ nodes_to_wrappers,
+ )
+
+ input_height = input_tensor.shape[1]
+ input_width = input_tensor.shape[2]
+
+ output_height = node.args[1][0]
+ output_width = node.args[1][1]
+
+ filter_height = input_height // output_height
+ filter_width = input_width // output_width
+ filter = [filter_height, filter_width]
+ filter_shape = [len(filter)]
+
+ stride_height = filter_height
+ stride_width = filter_width
+ stride = [stride_height, stride_width]
+ stride_shape = [len(stride)]
+
+ height = (output_height - 1) * stride_height + filter_height - input_height
+ width = (output_width - 1) * stride_width + filter_width - input_width
+ if height % 2 != 0 or width % 2 != 0:
+ warnings.warn(
+ "[QNN Delegate Op Builder]: Height or Width is not divisble by 2 with no remainder, fall back op",
+ stacklevel=1,
+ )
+ return
+
+ padding_height = height / 2
+ padding_width = width / 2
+ padding = [padding_height, padding_width]
+ padding_shape = [2, 2]
+
+ out_tensor = self.get_tensor(node, node)
+ output_tensor_wrapper = self.define_tensor(
+ node,
+ node,
+ out_tensor,
+ PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+ nodes_to_wrappers,
+ )
+
+ adaptive_avg_pool2d_op = PyQnnWrapper.PyQnnOpWrapper(
+ node.name,
+ QNN_OP_PACKAGE_NAME_QTI_AISW,
+ OpPoolAvg2d.op_name,
+ )
+
+ adaptive_avg_pool2d_op.AddInputTensors([input_tensor_wrapper])
+ adaptive_avg_pool2d_op.AddOutputTensors([output_tensor_wrapper])
+
+ adaptive_avg_pool2d_op.AddTensorParam(
+ OpPoolAvg2d.param_filter_size,
+ PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+ len(filter_shape),
+ filter_shape,
+ np.array(
+ filter,
+ dtype=np.uint32,
+ ),
+ True,
+ )
+
+ adaptive_avg_pool2d_op.AddTensorParam(
+ OpPoolAvg2d.param_stride,
+ PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+ len(stride_shape),
+ stride_shape,
+ np.array(
+ stride,
+ dtype=np.uint32,
+ ),
+ True,
+ )
+
+ adaptive_avg_pool2d_op.AddTensorParam(
+ OpPoolAvg2d.param_pad_amount,
+ PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+ len(padding_shape),
+ padding_shape,
+ np.array(
+ [[padding[0], padding[0]], [padding[1], padding[1]]],
+ dtype=np.uint32,
+ ),
+ True,
+ )
+
+ return adaptive_avg_pool2d_op
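
The builder lowers `adaptive_avg_pool2d` to a fixed-window `PoolAvg2d` by deriving filter, stride, and padding from the input and output spatial sizes, and falls back when the required padding is odd. A worked example of that arithmetic, with `avg_pool_params` as a hypothetical helper and the 7x7 spatial size taken from the new unit test:

```python
# Worked example (illustrative numbers only) of mapping adaptive_avg_pool2d to
# a fixed AvgPool2d: for a 7x7 input pooled to (1, 1), filter = stride = 7 and
# the computed padding is 0.

def avg_pool_params(input_hw, output_hw):
    (ih, iw), (oh, ow) = input_hw, output_hw
    fh, fw = ih // oh, iw // ow              # filter size
    sh, sw = fh, fw                          # stride equals filter size
    pad_h = (oh - 1) * sh + fh - ih          # total padding along H
    pad_w = (ow - 1) * sw + fw - iw          # total padding along W
    if pad_h % 2 or pad_w % 2:
        raise ValueError("odd padding: the builder falls back instead of lowering")
    return (fh, fw), (sh, sw), (pad_h // 2, pad_w // 2)

assert avg_pool_params((7, 7), (1, 1)) == ((7, 7), (7, 7), (0, 0))
assert avg_pool_params((8, 8), (2, 2)) == ((4, 4), (4, 4), (0, 0))
```

This mapping is exact when the input spatial size is an integer multiple of the requested output size; otherwise the warning path above rejects the node.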
diff --git a/backends/qualcomm/builders/op_layer_norm.py b/backends/qualcomm/builders/op_layer_norm.py
index 2006c71648..06f822014e 100644
--- a/backends/qualcomm/builders/op_layer_norm.py
+++ b/backends/qualcomm/builders/op_layer_norm.py
@@ -63,15 +63,19 @@ def define_node(
nodes_to_wrappers,
)
+ layer_norm_input_tensors = [input_tensor_wrapper, weight_tensor_wrapper]
+
bias_node = node.args[3]
- bias_tensor = get_parameter(bias_node, self.edge_program)
- bias_tensor_wrapper = self.define_tensor(
- bias_node,
- node,
- bias_tensor,
- PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
- nodes_to_wrappers,
- )
+ if bias_node is not None:
+ bias_tensor = get_parameter(bias_node, self.edge_program)
+ bias_tensor_wrapper = self.define_tensor(
+ bias_node,
+ node,
+ bias_tensor,
+ PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+ nodes_to_wrappers,
+ )
+ layer_norm_input_tensors.append(bias_tensor_wrapper)
epsilon = node.args[4]
@@ -89,9 +93,7 @@ def define_node(
QNN_OP_PACKAGE_NAME_QTI_AISW,
OpLayerNorm.op_name,
)
- layer_norm_op.AddInputTensors(
- [input_tensor_wrapper, weight_tensor_wrapper, bias_tensor_wrapper]
- )
+ layer_norm_op.AddInputTensors(layer_norm_input_tensors)
layer_norm_op.AddOutputTensors([output_tensor_wrapper])
layer_norm_op.AddScalarParam(
OpLayerNorm.param_epsilon,
diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py
index d1daa6c1e5..e5b4778312 100644
--- a/backends/qualcomm/builders/op_rms_norm.py
+++ b/backends/qualcomm/builders/op_rms_norm.py
@@ -66,7 +66,7 @@ def define_node(
nodes_to_wrappers,
)
- # Fake node, nn moudle seems to be inconsistant with document
+ # Fake node, nn module seems to be inconsistent with the documentation
bias_tensor = torch.zeros(weight_tensor.shape)
bias_node = torch.fx.Node(
node.graph,
diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py
index e1792cb183..8bf2265fb5 100644
--- a/backends/qualcomm/quantizer/annotators.py
+++ b/backends/qualcomm/quantizer/annotators.py
@@ -512,6 +512,11 @@ def annotate_sqrt(node: Node, quantization_config: QuantizationConfig) -> None:
annotate_single_in_single_out(node, quantization_config)
+@register_annotator([torch.ops.aten.square.default])
+def annotate_square(node: Node, quantization_config: QuantizationConfig) -> None:
+ annotate_single_in_single_out(node, quantization_config)
+
+
@register_annotator([torch.ops.aten.gelu.default])
def annotate_gelu(node: Node, quantization_config: QuantizationConfig) -> None:
annotate_single_in_single_out(node, quantization_config)
diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py
index d1c1757cc1..33237f3beb 100644
--- a/backends/qualcomm/quantizer/custom_annotation.py
+++ b/backends/qualcomm/quantizer/custom_annotation.py
@@ -14,17 +14,80 @@
QuantizationConfig,
)
from executorch.exir.dialects._ops import ops as exir_ops
-from torch.ao.quantization.observer import MinMaxObserver
+from torch.ao.quantization.observer import FixedQParamsObserver, MinMaxObserver
from torch.ao.quantization.quantizer import (
QuantizationAnnotation,
+ QuantizationSpec,
SharedQuantizationSpec,
)
from torch.fx import Node
-def annotate_matmul_16a8w( # noqa: C901
- gm: torch.fx.GraphModule, traverse_input1=True
-) -> None:
+def annotate_linear_16a8w_in_affine_layer(gm: torch.fx.GraphModule) -> None:
+ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None:
+ input_qspec_map = {}
+ input_act = node.args[0]
+ input_spec = quantization_config.input_activation
+ input_qspec_map[input_act] = input_spec
+
+ weight = node.args[1]
+ input_qspec_map[weight] = quantization_config.weight
+
+ node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+ input_qspec_map=input_qspec_map,
+ output_qspec=quantization_config.output_activation,
+ _annotated=True,
+ )
+
+ quantization_config_16a8w_per_channel = get_ptq_per_channel_quant_config(
+ torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
+ )
+ for node in gm.graph.nodes:
+ if node.op == "call_function" and node.target == torch.ops.aten.conv2d.default:
+ if "nn_module_stack" in node.meta:
+ module_values_list = list(node.meta["nn_module_stack"].values())
+ full_qualified_name = module_values_list[-1][0]
+ if full_qualified_name == "output.conv":
+ annotate_conv2d(
+ node, quantization_config=quantization_config_16a8w_per_channel
+ )
+
+
+def annotate_prefill_kv_output(gm: torch.fx.GraphModule, kv_quant_attrs: dict):
+ for node in gm.graph.nodes:
+ if node.op == "output":
+ for index, prefill_output in enumerate(node.args[0]):
+ kv_quant_attr = kv_quant_attrs[index]
+ fixed_observer = FixedQParamsObserver.with_args(
+ scale=kv_quant_attr[0],
+ zero_point=kv_quant_attr[1],
+ quant_min=kv_quant_attr[2],
+ quant_max=kv_quant_attr[3],
+ dtype=kv_quant_attr[4],
+ qscheme=torch.torch.per_tensor_affine,
+ )
+
+ fixed_output_spec = QuantizationSpec(
+ quant_min=kv_quant_attr[2],
+ quant_max=kv_quant_attr[3],
+ dtype=kv_quant_attr[4],
+ ch_axis=0,
+ observer_or_fake_quant_ctr=fixed_observer,
+ )
+
+ input_qspec_map = {}
+ for input in prefill_output.args:
+ if isinstance(input, Node):
+ input_qspec_map[input] = fixed_output_spec
+
+ prefill_output.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+ input_qspec_map=input_qspec_map,
+ output_qspec=fixed_output_spec,
+ _annotated=True,
+ )
+
+
+def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901
"""
This function is specific for matmul op 16a8w.
For k, we will tag such as the below, and
@@ -142,8 +205,7 @@ def annotate_matmul_input1(node: Node):
for node in gm.graph.nodes:
if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
annotate_matmul(node, quantization_config_16a8w)
- if traverse_input1:
- annotate_matmul_input1(node.args[1])
+ annotate_matmul_input1(node.args[1])
def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None: # noqa: C901
diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp
index f2650301a3..83a94fdfdf 100644
--- a/backends/qualcomm/runtime/QnnManager.cpp
+++ b/backends/qualcomm/runtime/QnnManager.cpp
@@ -154,8 +154,9 @@ Error QnnManager::RegisterMem(
const std::shared_ptr& tensor_wrapper) {
SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
// Not enable shared buffer
- if (!options_->shared_buffer())
+ if (!options_->shared_buffer()) {
return Error::Internal;
+ }
if (backend_params_ptr_->qnn_mem_manager_ptr_ == nullptr) {
QNN_EXECUTORCH_LOG_WARN(
diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h
index 0157ee5837..17294afbd8 100644
--- a/backends/qualcomm/runtime/QnnManager.h
+++ b/backends/qualcomm/runtime/QnnManager.h
@@ -145,7 +145,7 @@ class QnnManager {
{Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_8,
executorch::aten::ScalarType::Byte},
{Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_16,
- executorch::aten::ScalarType::Bits16},
+ executorch::aten::ScalarType::UInt16},
};
};
} // namespace qnn
diff --git a/backends/qualcomm/runtime/backends/QnnMemManager.h b/backends/qualcomm/runtime/backends/QnnMemManager.h
index 664f717dc0..a0bdafab7b 100644
--- a/backends/qualcomm/runtime/backends/QnnMemManager.h
+++ b/backends/qualcomm/runtime/backends/QnnMemManager.h
@@ -77,7 +77,7 @@ class QnnMemManager {
Qnn_DataType_t::QNN_DATATYPE_SFIXED_POINT_16},
{executorch::aten::ScalarType::Byte,
Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_8},
- {executorch::aten::ScalarType::Bits16,
+ {executorch::aten::ScalarType::UInt16,
Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_16},
};
};
diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh
index ed77a87351..506bb92752 100755
--- a/backends/qualcomm/scripts/build.sh
+++ b/backends/qualcomm/scripts/build.sh
@@ -87,6 +87,7 @@ if [ "$BUILD_AARCH64" = true ]; then
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
-DANDROID_ABI='arm64-v8a' \
-DANDROID_NATIVE_API_LEVEL=23 \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-B$BUILD_ROOT
@@ -101,6 +102,7 @@ if [ "$BUILD_AARCH64" = true ]; then
-DANDROID_ABI='arm64-v8a' \
-DANDROID_NATIVE_API_LEVEL=23 \
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-B$EXAMPLE_ROOT
@@ -125,6 +127,7 @@ if [ "$BUILD_X86_64" = true ]; then
-DEXECUTORCH_BUILD_QNN=ON \
-DEXECUTORCH_BUILD_DEVTOOLS=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index d66aa34e5a..3ad183c2c2 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -16,6 +16,15 @@ def forward(self, x):
return torch.abs(x)
+class AdaptiveAvgPool2D(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x):
+ adaptive_avg_pool = torch.nn.AdaptiveAvgPool2d((1, 1))
+ return adaptive_avg_pool(x)
+
+
class Add(torch.nn.Module):
def __init__(self):
super().__init__()
@@ -685,15 +694,24 @@ def forward(self, x):
class LayerNorm(torch.nn.Module):
- def __init__(self):
+ def __init__(self, bias=True):
super().__init__()
- self.layer_norm = torch.nn.LayerNorm([768], eps=1e-6)
+ self.layer_norm = torch.nn.LayerNorm([768], eps=1e-6, bias=bias)
self.linear = torch.nn.Linear(768, 196)
def forward(self, x):
return self.linear(self.layer_norm(x))
+class LayerNormAdd(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.layer_norm = torch.nn.LayerNorm([512], eps=1e-6, bias=False)
+
+ def forward(self, x, y):
+ return self.layer_norm(x) + y
+
+
class LeakyReLUDefault(torch.nn.Module):
def __init__(self):
super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 30ed34032f..498ee4ea68 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -37,8 +37,9 @@
skip_annotation,
update_spill_fill_size,
)
+from executorch.examples.models.llama.llama_transformer import MOEFeedForward
-from executorch.examples.models.llama.llama_transformer import ModelArgs, MOEFeedForward
+from executorch.examples.models.llama.model_args import ModelArgs
from executorch.examples.qualcomm.utils import setup_common_args_and_variables
@@ -97,6 +98,11 @@ def test_qnn_backend_abs(self):
sample_input = (torch.randn(1, 2, 3, 4),)
self.lower_module_and_test_output(module, sample_input)
+ def test_qnn_backend_adaptive_avg_pool2d(self):
+ module = AdaptiveAvgPool2D() # noqa: F405
+ sample_input = (torch.randn(1, 512, 7, 7),)
+ self.lower_module_and_test_output(module, sample_input)
+
def test_qnn_backend_arange(self):
modules = [
Arange(start=1, end=11, step=1, dtype=torch.int32), # noqa: F405
@@ -432,9 +438,11 @@ def test_qnn_backend_interpolate_nearest_2d(self):
self.lower_module_and_test_output(module, sample_input)
def test_qnn_backend_layer_norm(self):
- module = LayerNorm() # noqa: F405
+ modules = [LayerNorm(), LayerNorm(bias=False)] # noqa: F405
sample_input = (torch.randn(196, 768),)
- self.lower_module_and_test_output(module, sample_input)
+ for i, module in enumerate(modules):
+ with self.subTest(i=i):
+ self.lower_module_and_test_output(module, sample_input)
def test_qnn_backend_leaky_relu(self):
test_comb = [
@@ -915,6 +923,12 @@ def test_qnn_backend_abs(self):
module = self.get_qdq_module(module, sample_input)
self.lower_module_and_test_output(module, sample_input)
+ def test_qnn_backend_adaptive_avg_pool2d(self):
+ module = AdaptiveAvgPool2D() # noqa: F405
+ sample_input = (torch.randn(1, 512, 7, 7),)
+ module = self.get_qdq_module(module, sample_input)
+ self.lower_module_and_test_output(module, sample_input)
+
def test_qnn_backend_arange(self):
modules = [
Arange(start=1, end=6, step=0.5, dtype=torch.float32), # noqa: F405
@@ -1280,10 +1294,12 @@ def test_qnn_backend_interpolate_nearest_2d(self):
self.lower_module_and_test_output(module, sample_input)
def test_qnn_backend_layer_norm(self):
- module = LayerNorm() # noqa: F405
+ modules = [LayerNorm(), LayerNorm(bias=False)] # noqa: F405
sample_input = (torch.randn(196, 768),)
- module = self.get_qdq_module(module, sample_input)
- self.lower_module_and_test_output(module, sample_input)
+ for i, module in enumerate(modules):
+ with self.subTest(i=i):
+ module = self.get_qdq_module(module, sample_input)
+ self.lower_module_and_test_output(module, sample_input)
def test_qnn_backend_leaky_relu(self):
test_comb = [
@@ -2675,6 +2691,42 @@ def required_envs(self, conditions=None) -> bool:
]
)
+ def test_conv_former(self):
+ if not self.required_envs([self.image_dataset]):
+ self.skipTest("missing required envs")
+
+ cmds = [
+ "python",
+ f"{self.executorch_root}/examples/qualcomm/oss_scripts/conv_former.py",
+ "--dataset",
+ self.image_dataset,
+ "--artifact",
+ self.artifact_dir,
+ "--build_folder",
+ self.build_folder,
+ "--device",
+ self.device,
+ "--model",
+ self.model,
+ "--ip",
+ self.ip,
+ "--port",
+ str(self.port),
+ ]
+ if self.host:
+ cmds.extend(["--host", self.host])
+
+ p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+ with Listener((self.ip, self.port)) as listener:
+ conn = listener.accept()
+ p.communicate()
+ msg = json.loads(conn.recv())
+ if "Error" in msg:
+ self.fail(msg["Error"])
+ else:
+ self.assertGreaterEqual(msg["top_1"], 60)
+ self.assertGreaterEqual(msg["top_5"], 80)
+
def test_dino_v2(self):
if not self.required_envs([self.image_dataset]):
self.skipTest("missing required envs")
@@ -3529,7 +3581,7 @@ def test_stories_single_llama(self):
cmds = [
"python",
- f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama2/llama.py",
+ f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
"--artifact",
self.artifact_dir,
"--build_folder",
@@ -3556,6 +3608,8 @@ def test_stories_single_llama(self):
"16a4w",
"--temperature",
"0",
+ "--llama_model",
+ "stories110m",
]
if self.host:
cmds.extend(["--host", self.host])
diff --git a/backends/qualcomm/utils/constants.py b/backends/qualcomm/utils/constants.py
index 1cc51690ff..4f73d331ad 100644
--- a/backends/qualcomm/utils/constants.py
+++ b/backends/qualcomm/utils/constants.py
@@ -26,8 +26,8 @@
QCOM_SCALE_OFFSET = "scale_offset"
QCOM_ZERO_POINT = "zero_point"
QCOM_ZERO_POINTS = "zero_points"
-QCOM_PASS_EXPAND_BROADCAST_SHAPE = "expand_broadcast_shape"
-QCOM_PASS_SKIP_ADVANCED_REQUANT = "skip_advanced_requant"
+QCOM_PASS_ACTIVATE_KEY = "activate"
+QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY = "args_kwargs_defaults"
# constants in backends/qualcomm/tests
QCOM_ANNOTATION = "annotation"
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index a4acae9585..1bcfa3a6f6 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -3,13 +3,13 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-
+import inspect
import operator
import re
import time
import warnings
from collections import OrderedDict
-from typing import Any, Callable, Dict, FrozenSet, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor
@@ -46,6 +46,9 @@
from executorch.backends.qualcomm._passes.replace_index_put_input import (
ReplaceIndexPutInput,
)
+from executorch.backends.qualcomm._passes.utils import (
+ get_passes_dependency_for_capture_program,
+)
from executorch.backends.qualcomm.builders.node_visitor import (
QNN_QUANT_TYPE_MAP,
@@ -74,8 +77,8 @@
option_to_flatbuffer,
)
from executorch.backends.qualcomm.utils.constants import (
- QCOM_PASS_EXPAND_BROADCAST_SHAPE,
- QCOM_PASS_SKIP_ADVANCED_REQUANT,
+ QCOM_PASS_ACTIVATE_KEY,
+ QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY,
QCOM_QNN_COMPILE_SPEC,
QCOM_QUANTIZED_IO,
)
@@ -89,10 +92,12 @@
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.capture import ExecutorchBackendConfig
from executorch.exir.lowered_backend_module import LoweredBackendModule
+from executorch.exir.passes import PassManager
from executorch.exir.program._program import _get_updated_graph_signature
-from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions
+from torch._decomp import core_aten_decompositions, remove_decompositions
from torch.export.exported_program import ExportedProgram
from torch.fx import passes
+from torch.fx.passes.infra.pass_manager import this_before_that_pass_constraint
from torch.fx.passes.operator_support import OperatorSupportBase
from torch.library import Library
@@ -283,9 +288,10 @@ def set_spec(module, options):
def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]:
- source_decompositions = torch_core_aten_decompositions()
+ source_decompositions = core_aten_decompositions()
# The below super ops are supported by QNN
- remove_decompositions = [
+ skip_decompositions = [
+ torch.ops.aten.adaptive_avg_pool2d.default,
torch.ops.aten.pixel_shuffle.default,
torch.ops.aten.pixel_unshuffle.default,
torch.ops.aten.hardsigmoid.default,
@@ -293,39 +299,92 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]:
torch.ops.aten._safe_softmax.default,
]
- for key in remove_decompositions:
- source_decompositions.pop(key)
+ remove_decompositions(source_decompositions, skip_decompositions)
return source_decompositions
+def get_capture_program_passes():
+ """
+ Defines and returns the default ordered passes for the capture program.
+ This function creates an OrderedDict containing a series of default passes.
+
+ Returns:
+ OrderedDict: An ordered dictionary containing all default passes along with their activation status and initialization parameters.
+ """
+
+ # The second value in each tuple in `default_passes_and_setting` indicates whether the corresponding pass is activated by default.
+ # If a pass is activated, it will be executed by default.
+ default_passes_and_setting = [
+ (RemoveRedundancy, True),
+ (RecomposePixelUnshuffle, True),
+ (RecomposeRmsNorm, True),
+ (ConvertToLinear, True),
+ (ConvertPReLU, True),
+ (ConvertBmmToMatmul, True),
+ (ConvertInterpolateWithUpsample2D, True),
+ (I64toI32, True),
+ (AnnotateQuantAttrs, True),
+ (AnnotateAndQuantScalar, True),
+ (AnnotateDecomposed, True),
+ (FoldQDQ, True),
+ (ExpandBroadcastTensorShape, False),
+ (LayoutTransform, True),
+ (ReplaceIndexPutInput, True),
+ ]
+
+ passes = OrderedDict()
+ for p, act in default_passes_and_setting:
+ init_signature = inspect.signature(p.__init__)
+
+ args_kwargs_defaults = {
+ k: v.default if v.default is not inspect.Parameter.empty else None
+ for k, v in init_signature.parameters.items()
+ if k != "self"
+ }
+
+ passes[p] = {
+ QCOM_PASS_ACTIVATE_KEY: act,
+ QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY: args_kwargs_defaults,
+ }
+
+ return passes
+
+
+def _topological_sort_passes(passes: OrderedDict):
+ dep_table = get_passes_dependency_for_capture_program()
+ pm = PassManager()
+ for p in passes:
+ pm.add_pass(p)
+
+ for that, these in dep_table.items():
+ for this in these:
+ pm.add_constraint(this_before_that_pass_constraint(this, that))
+
+ pm.solve_constraints()
+ sorted_passes = OrderedDict()
+ for p in pm.passes:
+ sorted_passes[p] = passes[p]
+ return sorted_passes
+
+
def _transform(
- edge_program: ExportedProgram, custom_pass_config: FrozenSet[str] = frozenset()
+ edge_program: ExportedProgram, passes_job: OrderedDict = None
) -> ExportedProgram:
# currently ExirExportedProgram.transform does not accept
# changes of input number which was caused by FoldQDQ
# apply passes one by one here to avoid IR capture failure
graph_module = edge_program.graph_module
- RemoveRedundancy()(graph_module)
- RecomposePixelUnshuffle()(graph_module)
- RecomposeRmsNorm()(graph_module)
- ConvertToLinear()(graph_module)
- ConvertPReLU(edge_program)(graph_module)
- ConvertBmmToMatmul()(graph_module)
- ConvertInterpolateWithUpsample2D()(graph_module)
- I64toI32(edge_program)(graph_module)
- AnnotateQuantAttrs(
- edge_program, QCOM_PASS_SKIP_ADVANCED_REQUANT in custom_pass_config
- )(graph_module)
- AnnotateAndQuantScalar(edge_program)(graph_module)
- AnnotateDecomposed(edge_program)(graph_module)
- FoldQDQ()(graph_module)
- # this pass is not necessary for network without layout-sensitive ops
- # enable defaultly will introduce overhead from extra view_copy nodes
- if QCOM_PASS_EXPAND_BROADCAST_SHAPE in custom_pass_config:
- ExpandBroadcastTensorShape()(graph_module)
- LayoutTransform(edge_program)(graph_module)
- ReplaceIndexPutInput(edge_program)(graph_module)
+ passes_job = passes_job if passes_job is not None else get_capture_program_passes()
+ passes_job = _topological_sort_passes(passes_job)
+ for p in passes_job:
+ if not passes_job[p][QCOM_PASS_ACTIVATE_KEY]:
+ continue
+
+ kwargs = passes_job[p][QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY]
+ if "edge_program" in kwargs:
+ kwargs["edge_program"] = edge_program
+ p(**kwargs)(graph_module)
# Since QDQ nodes are stripped, update graph signature again to validate program
edge_program._graph_signature = _get_updated_graph_signature(
@@ -339,7 +398,7 @@ def _transform(
def capture_program(
module: torch.nn.Module,
inputs: Tuple[torch.Tensor],
- custom_pass_config: FrozenSet[str] = frozenset(),
+ passes_job: OrderedDict = None,
) -> exir.ExirExportedProgram:
ep = torch.export.export(module, inputs, strict=True)
decomposed_ep = ep.run_decompositions(get_decomp_table())
@@ -350,7 +409,8 @@ def capture_program(
core_ep = ExirExportedProgram(decomposed_ep, False)
core_ep.transform(ConvertBinaryOpsWithScalar())
edge_ep = core_ep.to_edge(qnn_edge_config())
- _transform(edge_ep.exported_program, custom_pass_config)
+
+ _transform(edge_ep.exported_program, passes_job)
return edge_ep
@@ -906,28 +966,34 @@ def generate_multi_graph_program(
def generate_composite_llama_program(
+ llama_model: torch.nn.Module,
graph_names: List[str],
sample_inputs_list: List[Tuple[Any]],
lower_module_dict: Dict[str, List[LoweredBackendModule]],
call_delegate_node_name_dict: Dict[str, List[str]],
call_delegate_inputs_dict: Dict[str, List[Tuple[str, int | None]]],
outputs_dict: Dict[str, List[Tuple[str, int]]],
+ embedding_quantize: str,
backend_config: ExecutorchBackendConfig = None,
constant_methods: Optional[Dict[str, Any]] = None,
) -> ExecutorchProgramManager:
class CompositeLlamaModule(torch.nn.Module):
def __init__(
self,
+ llama_model,
lower_module_list,
call_delegate_node_name_list,
call_delegate_inputs_list,
outputs_list,
+ embedding_quantize,
) -> None:
super().__init__()
+ self.llama_model = llama_model
self.lower_module_list = lower_module_list
self.call_delegate_node_name_list = call_delegate_node_name_list
self.call_delegate_inputs_list = call_delegate_inputs_list
self.outputs_list = outputs_list
+ self.embedding_quantize = embedding_quantize
def reorder(
self,
@@ -960,6 +1026,13 @@ def forward(
}
for num, arg in enumerate(args):
module_input_dict[f"args_{num}"] = arg
+
+ if self.embedding_quantize:
+ hidden_states = self.llama_model.tok_embeddings(tokens)
+ module_input_dict["quantized_decomposed_embedding_4bit_dtype"] = (
+ hidden_states
+ )
+
for lower_module, call_delegate_node_name, call_delegate_inputs in zip(
self.lower_module_list,
self.call_delegate_node_name_list,
@@ -976,10 +1049,12 @@ def forward(
progs_dict = {}
for graph_name, sample_inputs in zip(graph_names, sample_inputs_list):
composite_llama_module = CompositeLlamaModule(
+ llama_model,
lower_module_dict[graph_name],
call_delegate_node_name_dict[graph_name],
call_delegate_inputs_dict[graph_name],
outputs_dict[graph_name],
+ embedding_quantize,
)
prog = torch.export.export(composite_llama_module, sample_inputs)
progs_dict[graph_name] = prog
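For illustration, a minimal sketch (not part of this patch) of how the `passes_job` mechanism added in this file could be used to toggle an optional pass. The toy module, inputs, and the import path for `ExpandBroadcastTensorShape` are assumptions inferred from the pass names above:

```python
import torch

from executorch.backends.qualcomm._passes.expand_broadcast_tensor_shape import (
    ExpandBroadcastTensorShape,  # assumed module path for the pass referenced above
)
from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY
from executorch.backends.qualcomm.utils.utils import (
    capture_program,
    get_capture_program_passes,
)

# Hypothetical module and example inputs.
module = torch.nn.Linear(16, 16).eval()
inputs = (torch.randn(1, 16),)

# Start from the defaults, then opt in to the broadcast-expansion pass,
# which is disabled by default to avoid extra view_copy overhead.
passes_job = get_capture_program_passes()
passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True

edge_prog = capture_program(module, inputs, passes_job=passes_job)
```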
diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS
index 74048cfb6a..4e60fc7bd7 100644
--- a/backends/vulkan/_passes/TARGETS
+++ b/backends/vulkan/_passes/TARGETS
@@ -30,6 +30,19 @@ runtime.python_library(
]
)
+runtime.python_library(
+ name = "remove_asserts",
+ srcs = ["remove_asserts.py"],
+ visibility = [
+ "//executorch/backends/...",
+ ],
+ deps = [
+ "//caffe2:torch",
+ "//executorch/exir:pass_base",
+ "//executorch/exir/dialects:lib",
+ ],
+)
+
runtime.python_library(
name = "remove_local_scalar_dense",
srcs = ["remove_local_scalar_dense_ops.py"],
@@ -83,6 +96,7 @@ runtime.python_library(
deps = [
":insert_prepack_nodes",
":int4_weight_only_quantizer",
+ ":remove_asserts",
":remove_local_scalar_dense",
":remove_redundant_ops",
":tag_memory_meta_pass"
diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py
index 416339574b..8c29f5488f 100644
--- a/backends/vulkan/_passes/__init__.py
+++ b/backends/vulkan/_passes/__init__.py
@@ -2,6 +2,10 @@
from executorch.backends.vulkan._passes.int4_weight_only_quantizer import (
VkInt4WeightOnlyQuantizer,
)
+from executorch.backends.vulkan._passes.remove_asserts import (
+ remove_asserts,
+ RemoveAssertsTransform,
+)
from executorch.backends.vulkan._passes.remove_local_scalar_dense_ops import (
RemoveLocalScalarDenseOpsTransform,
)
@@ -13,6 +17,8 @@
__all__ = [
"insert_prepack_nodes",
"VkInt4WeightOnlyQuantizer",
+ "remove_asserts",
+ "RemoveAssertsTransform",
"RemoveLocalScalarDenseOpsTransform",
"RemoveRedundantOpsTransform",
"TagMemoryMetaPass",
diff --git a/backends/vulkan/_passes/insert_prepack_nodes.py b/backends/vulkan/_passes/insert_prepack_nodes.py
index 7876806d6d..bf1fc28ba5 100644
--- a/backends/vulkan/_passes/insert_prepack_nodes.py
+++ b/backends/vulkan/_passes/insert_prepack_nodes.py
@@ -60,6 +60,12 @@ def prepack_not_required(node: torch.fx.Node) -> bool:
)
# This pass assumes that the SpecPropPass() has already been applied
assert "spec" in node.meta
+ # Mutable buffers will not be marked as constant, but they might as well be
+ # for the purposes of memory planning. Mark the spec as constant so that
+ # the node is handled correctly by the memory planning pass.
+ if not node.meta["spec"].const:
+ assert is_param_node(program, node)
+ node.meta["spec"].const = True
# Validate that the original node is marked as a constant. Constant tensors
# do not participate in memory planning.
assert node.meta["spec"].const
@@ -68,7 +74,9 @@ def prepack_not_required(node: torch.fx.Node) -> bool:
# Set the mem_obj_id to -1 to indicate that this node requires a dedicated
# memory object.
prepack_node.meta["spec"].mem_obj_id = -1
- node.replace_all_uses_with(prepack_node, lambda x, y=prepack_node: x != y)
+ node.replace_all_uses_with(
+ prepack_node, lambda x, y=prepack_node: (x != y and x.op != "output")
+ )
program.graph.eliminate_dead_code()
return program
diff --git a/backends/vulkan/_passes/remove_asserts.py b/backends/vulkan/_passes/remove_asserts.py
new file mode 100644
index 0000000000..835f2ec141
--- /dev/null
+++ b/backends/vulkan/_passes/remove_asserts.py
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from typing import Set, Union
+
+import torch
+
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.program._program import _get_updated_graph_signature
+
+from torch.export.exported_program import ExportedProgram
+
+OpType = Union[str, torch._ops.OpOverload, EdgeOpOverload]
+
+
+class RemoveAssertsTransform(ExportPass):
+ """
+ Remove operators which perform assertions. These are not possible to execute in
+ Vulkan since GLSL shaders cannot abort execution at runtime. Therefore, remove these
+ operators.
+ """
+
+ assert_ops: Set[OpType] = {
+ torch.ops.aten._assert_scalar.default,
+ torch.ops.aten.sym_constrain_range_for_size.default,
+ }
+
+ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+ for node in graph_module.graph.nodes:
+ if node.target in self.assert_ops:
+ graph_module.graph.erase_node(node)
+
+ graph_module.graph.eliminate_dead_code()
+ graph_module.recompile()
+ return PassResult(graph_module, True)
+
+
+def remove_asserts(edge_program: ExportedProgram) -> ExportedProgram:
+ graph_module = edge_program.graph_module
+ RemoveAssertsTransform()(graph_module)
+
+ edge_program._graph_signature = _get_updated_graph_signature(
+ edge_program.graph_signature, graph_module
+ )
+ edge_program._validate()
+ return edge_program
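A minimal usage sketch (not part of this patch) for the new pass, assuming a toy model; `remove_asserts` strips the assertion ops and re-validates the exported program as defined above:

```python
import torch

from executorch.backends.vulkan._passes import remove_asserts
from executorch.exir import to_edge

# Hypothetical model and example inputs.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 8),)

edge = to_edge(torch.export.export(model, example_inputs))
# Remove assertion ops, which GLSL shaders cannot execute, before Vulkan lowering.
edge_program = remove_asserts(edge.exported_program())
```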
diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py
index 1d08817e26..f2f54404ca 100644
--- a/backends/vulkan/_passes/tag_memory_meta_pass.py
+++ b/backends/vulkan/_passes/tag_memory_meta_pass.py
@@ -23,9 +23,6 @@
from executorch.exir.pass_base import ExportPass, PassResult
-from torch.fx.passes.tools_common import NodeList
-from torch.fx.passes.utils.fuser_utils import topo_sort
-
logger: logging.Logger = logging.getLogger("")
logger.setLevel(logging.INFO)
@@ -220,9 +217,7 @@ def should_delay_annotation(self, node: torch.fx.Node) -> bool:
# noqa
def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
- sorted_nodes: NodeList = topo_sort(list(graph_module.graph.nodes))
-
- for node in sorted_nodes:
+ for node in graph_module.graph.nodes:
if not self.should_annotate(node) or self.should_delay_annotation(node):
continue
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index d70cf93b88..25cf74dc8f 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -478,7 +478,7 @@ def register_convolution_op(features: OpFeatures):
@update_features("llama::sdpa_with_kv_cache")
-def register_sdpa_op(features: OpFeatures):
+def register_sdpa_with_kv_cache_op(features: OpFeatures):
features.texture_impl = TextureImplFeatures(
valid_packed_dims={PackedDim.WIDTH},
)
@@ -489,6 +489,16 @@ def register_sdpa_op(features: OpFeatures):
return features
+@update_features(["llama::update_cache", "llama::custom_sdpa"])
+def register_sdpa_ops(features: OpFeatures):
+ features.resize_fn = False
+ features.buffer_impl = False
+ features.texture_impl = TextureImplFeatures(
+ valid_packed_dims={PackedDim.WIDTH},
+ )
+ return features
+
+
@update_features(exir_ops.edge.et_vk.apply_rotary_emb.default)
def register_rotary_emb_op(features: OpFeatures):
features.texture_impl = TextureImplFeatures(
diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py
index 3c31e0316a..6ff3fa8d70 100644
--- a/backends/vulkan/partitioner/vulkan_partitioner.py
+++ b/backends/vulkan/partitioner/vulkan_partitioner.py
@@ -250,11 +250,19 @@ def _is_node_supported(self, node: torch.fx.Node) -> bool:
self.log_skip(node, "local scalar dense of incompatible op node")
return False
+ features = None
if target not in vulkan_supported_ops:
- self.log_skip(node, "no operator implementation")
- return False
+ # For some ops, e.g. custom ops, the name is registered instead of the
+ # OpOverload object.
+ if not isinstance(target, str) and target.name() in vulkan_supported_ops:
+ features = vulkan_supported_ops[target.name()]
+ else:
+ self.log_skip(node, "no operator implementation")
+ return False
+ else:
+ features = vulkan_supported_ops[target]
- features = vulkan_supported_ops[target]
+ assert features is not None
if not features.check_node_fn(node):
self.log_skip(node, "op args not supported")
diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index 8bff63d0e8..3d249aab4a 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -417,10 +417,10 @@ bool maybe_update_scalar_tensor(
executorch::aten::Tensor& scalar_tensor_src) {
const int32_t cur_val = graph->read_symint(ref);
int32_t scalar_tensor_val = 0;
- exec_aten::ScalarType dtype = scalar_tensor_src.scalar_type();
- if (dtype == exec_aten::ScalarType::Int) {
+ executorch::aten::ScalarType dtype = scalar_tensor_src.scalar_type();
+ if (dtype == executorch::aten::ScalarType::Int) {
scalar_tensor_val = *scalar_tensor_src.const_data_ptr<int32_t>();
- } else if (dtype == exec_aten::ScalarType::Long) {
+ } else if (dtype == executorch::aten::ScalarType::Long) {
scalar_tensor_val = int32_t(*scalar_tensor_src.const_data_ptr<int64_t>());
}
bool was_updated = false;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/activations.h b/backends/vulkan/runtime/graph/ops/glsl/activations.h
index 94c9e1274d..2ba0ccc467 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/activations.h
+++ b/backends/vulkan/runtime/graph/ops/glsl/activations.h
@@ -42,3 +42,15 @@ vec4 hardsigmoid(vec4 tex) {
hardsigmoid(tex.z),
hardsigmoid(tex.w));
}
+
+float leaky_relu(float x, float negative_slope) {
+ return x * (float(x > 0.0) + negative_slope * float(x <= 0.0));
+}
+
+vec4 leaky_relu(vec4 tex, float negative_slope) {
+ return vec4(
+ leaky_relu(tex.x, negative_slope),
+ leaky_relu(tex.y, negative_slope),
+ leaky_relu(tex.z, negative_slope),
+ leaky_relu(tex.w, negative_slope));
+}
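As a sanity check (not part of the patch), the branchless formulation used by the new GLSL helper can be compared against PyTorch's reference leaky ReLU:

```python
import torch
import torch.nn.functional as F

def leaky_relu_branchless(x: torch.Tensor, negative_slope: float) -> torch.Tensor:
    # Mirrors the GLSL helper: x * (float(x > 0) + negative_slope * float(x <= 0))
    return x * ((x > 0).float() + negative_slope * (x <= 0).float())

x = torch.randn(1024)
assert torch.allclose(leaky_relu_branchless(x, 0.01), F.leaky_relu(x, 0.01))
```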
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index c05c7e4450..3265a97398 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -14,12 +14,12 @@
#define TILE_SIZE ${TILE_SIZE}
-#define STRIDE_EQ_DILATION ${STRIDE_EQ_DILATION}
-
#define BATCH_SIZE_X ${BATCH_SIZE_X}
#define BATCH_SIZE_Y ${BATCH_SIZE_Y}
+#define LOCAL_WG_SIZE 64
+
#define op(X, A, B) ${OPERATOR}
#include "indexing_utils.h"
@@ -30,14 +30,28 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
-${layout_declare_ubo(4, "ivec3", "out_limits")}
-${layout_declare_ubo(5, "ivec4", "in_sizes")}
-${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
-${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
-${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
+
+layout(push_constant) uniform restrict Block {
+ ivec4 out_limits;
+ ivec4 in_sizes;
+ ivec2 kernel_size;
+ ivec2 stride;
+ ivec2 padding;
+ ivec2 dilation;
+ ivec2 overlay_region;
+ int in_group_size;
+ int dummy_padding;
+ float out_min;
+ float out_max;
+};
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+// For performance improvement, reduce register usage by caching positions in shared memory.
+// Offset index by 1 every 16 points to avoid bank access conflict.
+#define offset_pos_index(index) (index + ((index) >> 4))
+shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];
+
/*
* Computes a depthwise convolution. Each shader invocation calculates the
* output at a single output location.
@@ -63,6 +77,8 @@ void main() {
return;
}
+ pos_shared[offset_pos_index(gl_LocalInvocationIndex)] = pos;
+
// Compute the index of the top-left element of the overlay region. Negative
// indices indicate that the top-left element is in a region added by padding.
const ivec2 ipos = pos.xy * stride - padding;
@@ -109,18 +125,19 @@ void main() {
for (int j = 0; j < TILE_SIZE; j++, kx++) {
prev_kernel_line[j] = texelFetch(t_kernel, ivec2(kx, pos.z), 0);
for (int s = 0; s < BATCH_SIZE_X; s++) {
- sum[0][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[0][s]);
+ sum[0][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[0][s]);
}
}
}
}
+ const ivec3 out_pos = pos_shared[offset_pos_index(gl_LocalInvocationIndex)];
for (int y = 0; y < BATCH_SIZE_Y; y++) {
for (int x = 0; x < BATCH_SIZE_X; x++) {
- if (any(greaterThanEqual(ivec3(pos.x + x, pos.y + y, pos.z), out_limits))) {
+ if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits.xyz))) {
continue;
}
- imageStore(t_out, ivec3(pos.x + x, pos.y + y, pos.z), op(sum[y][x], out_min, out_max));
+ imageStore(t_out, ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), op(sum[y][x], out_min, out_max));
}
}
}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
index d3672f5ec2..9cf6c22c6c 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
@@ -12,7 +12,6 @@ conv2d_dw_output_tile:
TILE_SIZE: 3
BATCH_SIZE_X: 4
BATCH_SIZE_Y: 2
- STRIDE_EQ_DILATION: 0
generate_variant_forall:
DTYPE:
- VALUE: half
@@ -26,15 +25,3 @@ conv2d_dw_output_tile:
- NAME: conv2d_dw_output_tile_5x5_clamp
OPERATOR: clamp(X, A, B)
TILE_SIZE: 5
- - NAME: conv2d_dw_sed_output_tile_3x3
- STRIDE_EQ_DILATION: 1
- - NAME: conv2d_dw_sed_output_tile_3x3_clamp
- OPERATOR: clamp(X, A, B)
- STRIDE_EQ_DILATION: 1
- - NAME: conv2d_dw_sed_output_tile_5x5
- TILE_SIZE: 5
- STRIDE_EQ_DILATION: 1
- - NAME: conv2d_dw_sed_output_tile_5x5_clamp
- OPERATOR: clamp(X, A, B)
- TILE_SIZE: 5
- STRIDE_EQ_DILATION: 1
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
index bb70ee1aab..ceadc35779 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
@@ -24,11 +24,20 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
-${layout_declare_ubo(4, "ivec3", "out_limits")}
-${layout_declare_ubo(5, "ivec4", "in_sizes")}
-${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
-${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
-${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
+
+layout(push_constant) uniform restrict Block {
+ ivec4 out_limits;
+ ivec4 in_sizes;
+ ivec2 kernel_size;
+ ivec2 stride;
+ ivec2 padding;
+ ivec2 dilation;
+ ivec2 overlay_region;
+ int in_group_size;
+ int dummy_padding;
+ float out_min;
+ float out_max;
+};
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
index 77a334a05e..6757d2a6d4 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
@@ -42,3 +42,5 @@ unary_op:
OPERATOR: hardswish(X)
- NAME: hardsigmoid
OPERATOR: hardsigmoid(X)
+ - NAME: leaky_relu
+ OPERATOR: leaky_relu(X, A)
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 3c367f334d..71b7ce80cc 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -407,7 +407,9 @@ void add_conv2d_node(
wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
}
- if (method == Conv2dMethod::Pointwise) {
+ vkapi::ParamsBindList param_buffers;
+ std::vector<PushConstantDataInfo> push_constants;
+ if (method == Conv2dMethod::Pointwise || method == Conv2dMethod::Depthwise) {
const utils::ivec4 kernel_param_size_stride = {
kernel_params.kernel_size[0],
kernel_params.kernel_size[1],
@@ -420,55 +422,43 @@ void add_conv2d_node(
kernel_params.dilation[0],
kernel_params.dilation[1]};
- graph.execute_nodes().emplace_back(new DispatchNode(
- graph,
- shader,
- wg_size,
- graph.create_local_wg_size(wg_size),
- // Inputs and Outputs
- {{out, vkapi::MemoryAccessType::WRITE},
- {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
- // Shader params buffers
- {},
- // Specialization Constants
- {},
- // Resizing Logic
- resize_conv2d_node,
- {weight_data, stride, padding, dilation, transposed, output_padding},
- {
- graph.logical_limits_pc_of(out),
- graph.sizes_pc_of(in),
- PushConstantDataInfo(
- &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
- PushConstantDataInfo(
- &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
- PushConstantDataInfo(
- &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
- PushConstantDataInfo(&out_params, sizeof(out_params)),
- }));
+ push_constants = {
+ graph.logical_limits_pc_of(out),
+ graph.sizes_pc_of(in),
+ PushConstantDataInfo(
+ &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
+ PushConstantDataInfo(
+ &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
+ PushConstantDataInfo(
+ &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
+ PushConstantDataInfo(&out_params, sizeof(out_params)),
+ };
} else {
- graph.execute_nodes().emplace_back(new DispatchNode(
- graph,
- shader,
- wg_size,
- graph.create_local_wg_size(wg_size),
- // Inputs and Outputs
- {{out, vkapi::MemoryAccessType::WRITE},
- {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
- // Shader params buffers
- {
- t_out->logical_limits_ubo(),
- t_in->sizes_ubo(),
- graph.create_params_buffer(kernel_params),
- graph.create_params_buffer(extra_params),
- graph.create_params_buffer(out_params),
- },
- // Specialization Constants
- {},
- // Resizing Logic
- resize_conv2d_node,
- {weight_data, stride, padding, dilation, transposed, output_padding}));
+ param_buffers = {
+ t_out->logical_limits_ubo(),
+ t_in->sizes_ubo(),
+ graph.create_params_buffer(kernel_params),
+ graph.create_params_buffer(extra_params),
+ graph.create_params_buffer(out_params),
+ };
}
+
+ graph.execute_nodes().emplace_back(new DispatchNode(
+ graph,
+ shader,
+ wg_size,
+ graph.create_local_wg_size(wg_size),
+ // Inputs and Outputs
+ {{out, vkapi::MemoryAccessType::WRITE},
+ {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
+ // Shader params buffers
+ param_buffers,
+ // Specialization Constants
+ {},
+ // Resizing Logic
+ resize_conv2d_node,
+ {weight_data, stride, padding, dilation, transposed, output_padding},
+ push_constants));
}
void add_conv1d_node(
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
index a78ac0519c..1042c23bcb 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
@@ -73,13 +73,18 @@ void add_q_8w_linear_node(
auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
ValueRef mat1_W_packed = mat1;
ValueRef out_W_packed = out;
+ // Create temporary tensors to store the width packed versions of mat1 and out
+ TmpTensor mat1_tmp(
+ &graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked);
+ TmpTensor out_tmp(
+ &graph, graph.sizes_of(out), graph.dtype_of(out), utils::kWidthPacked);
if (!graph.is_buffer_storage(out) &&
graph.packed_dim_of(mat1) != WHCN::kWidthDim) {
// Ensure mat1 is width packed
- mat1_W_packed = graph.add_tensor_like(mat1, utils::kWidthPacked);
+ mat1_W_packed = mat1_tmp;
viewFn(graph, {mat1, graph.add_none(), mat1_W_packed});
// Ensure out is packed correctly
- out_W_packed = graph.add_tensor_like(out, utils::kWidthPacked);
+ out_W_packed = out_tmp;
}
ValueRef q_mat2 = prepack_standard(
graph, q_mat2_data, graph.storage_type_of(out), utils::kWidthPacked);
diff --git a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp
index 2c46201351..6dcf2fc4f4 100644
--- a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp
@@ -176,17 +176,32 @@ void resize_sdpa_out(
graph->get_tensor(out)->virtual_resize(graph->sizes_of(q_projected));
}
-void sdpa_with_kv_cache_impl(
- ComputeGraph& graph,
- const std::vector<ValueRef>& args) {
+void update_cache_impl(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+ int arg_idx = 0;
+ const ValueRef value = args[arg_idx++];
+ const ValueRef cache = args[arg_idx++];
+ const ValueRef input_pos_symint = args[arg_idx++];
+ const ValueRef out = args[arg_idx++];
+
+ // Unused variables
+ (void)out;
+
+ VK_CHECK_COND(graph.size_at<int32_t>(-4, value) == 1);
+ VK_CHECK_COND(graph.size_at<int32_t>(-4, cache) == 1);
+ VK_CHECK_COND(
+ graph.size_at<int32_t>(-1, value) == graph.size_at<int32_t>(-1, cache));
+ VK_CHECK_COND(
+ graph.size_at<int32_t>(-2, value) == graph.size_at<int32_t>(-2, cache));
+
+ add_kv_cache_update_node(graph, input_pos_symint, value, cache);
+}
+
+void sdpa_impl(ComputeGraph& graph, const std::vector<ValueRef>& args) {
int arg_idx = 0;
const ValueRef q_projected = args[arg_idx++];
- const ValueRef k_projected = args[arg_idx++];
- const ValueRef v_projected = args[arg_idx++];
- const ValueRef k_cache_data = args[arg_idx++];
- const ValueRef v_cache_data = args[arg_idx++];
+ const ValueRef k_cache = args[arg_idx++];
+ const ValueRef v_cache = args[arg_idx++];
const ValueRef input_pos_symint = args[arg_idx++];
- const ValueRef sequence_len = args[arg_idx++];
const ValueRef attn_mask = args[arg_idx++];
const ValueRef dropout_p = args[arg_idx++];
const ValueRef is_causal = args[arg_idx++];
@@ -195,23 +210,20 @@ void sdpa_with_kv_cache_impl(
// Output tensors
const ValueRef out = args[arg_idx++];
- // Unused variables
- (void)sequence_len;
-
// Batches must be 1
VK_CHECK_COND(graph.size_at<int32_t>(-4, q_projected) == 1);
- VK_CHECK_COND(graph.size_at<int32_t>(-4, k_projected) == 1);
- VK_CHECK_COND(graph.size_at<int32_t>(-4, v_projected) == 1);
+ VK_CHECK_COND(graph.size_at<int32_t>(-4, k_cache) == 1);
+ VK_CHECK_COND(graph.size_at<int32_t>(-4, v_cache) == 1);
// k and v projected must have the same shape
- VK_CHECK_COND(graph.sizes_of(k_projected) == graph.sizes_of(v_projected));
+ VK_CHECK_COND(graph.sizes_of(k_cache) == graph.sizes_of(v_cache));
// head dim must match between tensors
VK_CHECK_COND(
graph.size_at<int32_t>(-1, q_projected) ==
- graph.size_at<int32_t>(-1, k_projected));
+ graph.size_at<int32_t>(-1, k_cache));
// All tensors must have the packed dim be the width (head) dimension
VK_CHECK_COND(graph.packed_dim_of(q_projected) == WHCN::kWidthDim);
- VK_CHECK_COND(graph.packed_dim_of(k_projected) == WHCN::kWidthDim);
- VK_CHECK_COND(graph.packed_dim_of(v_projected) == WHCN::kWidthDim);
+ VK_CHECK_COND(graph.packed_dim_of(k_cache) == WHCN::kWidthDim);
+ VK_CHECK_COND(graph.packed_dim_of(v_cache) == WHCN::kWidthDim);
// Some variables are not supported yet
VK_CHECK_COND(
graph.val_is_none(dropout_p) ||
@@ -222,16 +234,8 @@ void sdpa_with_kv_cache_impl(
graph.val_is_none(is_causal) || graph.extract_scalar(is_causal));
VK_CHECK_COND(graph.val_is_none(attn_mask));
- const ValueRef k_cache =
- prepack_standard_like(graph, k_cache_data, q_projected);
- const ValueRef v_cache =
- prepack_standard_like(graph, v_cache_data, q_projected);
-
const int32_t max_seq_len = graph.size_at<int32_t>(1, k_cache);
- add_kv_cache_update_node(graph, input_pos_symint, k_projected, k_cache);
- add_kv_cache_update_node(graph, input_pos_symint, v_projected, v_cache);
-
// Slice caches from 0 to input_pos + sequence_len
const ValueRef k_cache_sliced = graph.add_tensor_view(k_cache);
const ValueRef v_cache_sliced = graph.add_tensor_view(v_cache);
@@ -257,7 +261,7 @@ void sdpa_with_kv_cache_impl(
// Repeat interleave
const int64_t num_heads = graph.size_at<int64_t>(2, q_projected);
- const int64_t num_kv_heads = graph.size_at<int64_t>(2, k_projected);
+ const int64_t num_kv_heads = graph.size_at<int64_t>(2, k_cache);
const ValueRef num_repeats =
graph.add_scalar<int64_t>(num_heads / num_kv_heads);
@@ -331,8 +335,52 @@ void sdpa_with_kv_cache_impl(
new ExecuteNode(resize_sdpa_out, {q_projected, out}));
}
+void sdpa_with_kv_cache_impl(
+ ComputeGraph& graph,
+ const std::vector<ValueRef>& args) {
+ int arg_idx = 0;
+ const ValueRef q_projected = args[arg_idx++];
+ const ValueRef k_projected = args[arg_idx++];
+ const ValueRef v_projected = args[arg_idx++];
+ const ValueRef k_cache_data = args[arg_idx++];
+ const ValueRef v_cache_data = args[arg_idx++];
+ const ValueRef input_pos_symint = args[arg_idx++];
+ const ValueRef sequence_len = args[arg_idx++];
+ const ValueRef attn_mask = args[arg_idx++];
+ const ValueRef dropout_p = args[arg_idx++];
+ const ValueRef is_causal = args[arg_idx++];
+ const ValueRef scale = args[arg_idx++];
+
+ // Output tensors
+ const ValueRef out = args[arg_idx++];
+
+ (void)sequence_len;
+
+ const ValueRef k_cache =
+ prepack_standard_like(graph, k_cache_data, q_projected);
+ const ValueRef v_cache =
+ prepack_standard_like(graph, v_cache_data, q_projected);
+
+ update_cache_impl(graph, {k_projected, k_cache, input_pos_symint, -1});
+ update_cache_impl(graph, {v_projected, v_cache, input_pos_symint, -1});
+
+ sdpa_impl(
+ graph,
+ {q_projected,
+ k_cache,
+ v_cache,
+ input_pos_symint,
+ attn_mask,
+ dropout_p,
+ is_causal,
+ scale,
+ out});
+}
+
REGISTER_OPERATORS {
VK_REGISTER_OP(sdpa_with_kv_cache.default, sdpa_with_kv_cache_impl);
+ VK_REGISTER_OP(update_cache.default, update_cache_impl);
+ VK_REGISTER_OP(llama.custom_sdpa.default, sdpa_impl);
}
} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
index 62922e8d9e..4bf73fad5a 100644
--- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
@@ -114,6 +114,17 @@ float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) {
"hardshrink"); \
}
+#define DEFINE_LEAKY_RELU_FN(op_name) \
+ void op_name(ComputeGraph& graph, const std::vector<ValueRef>& args) { \
+ return add_unary_op_node( \
+ graph, \
+ args[0], \
+ get_val_or_inf(graph, args[1], /*neg slope*/ false), \
+ kDummyFloat, \
+ args[2], \
+ "leaky_relu"); \
+ }
+
void gelu(ComputeGraph& graph, const std::vector<ValueRef>& args) {
// args[1] is the `approximate` string
// https://fburl.com/code/9omngmyo
@@ -137,6 +148,7 @@ DEFINE_RELU_FN(relu);
DEFINE_HARDSHRINK_FN(hardshrink);
DEFINE_ACTIVATION_FN(hardswish);
DEFINE_ACTIVATION_FN(hardsigmoid);
+DEFINE_LEAKY_RELU_FN(leaky_relu);
REGISTER_OPERATORS {
VK_REGISTER_OP(aten.abs.default, abs);
@@ -155,6 +167,7 @@ REGISTER_OPERATORS {
VK_REGISTER_OP(aten.hardshrink.default, hardshrink);
VK_REGISTER_OP(aten.hardswish.default, hardswish);
VK_REGISTER_OP(aten.hardsigmoid.default, hardsigmoid);
+ VK_REGISTER_OP(aten.leaky_relu.default, leaky_relu);
}
} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 9cec4891c1..2130573c0c 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -1072,6 +1072,7 @@ def get_reduce_op_inputs():
"aten.cos.default",
"aten.hardswish.default",
"aten.hardsigmoid.default",
+ "aten.leaky_relu.default",
]
)
def get_unary_ops_inputs():
diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index ed8cf8d8e1..a21ef4f668 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -1,4 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -128,8 +129,17 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
#
list(TRANSFORM _xnn_executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
add_executable(xnn_executor_runner ${_xnn_executor_runner__srcs})
+
+ if(EXECUTORCH_ENABLE_EVENT_TRACER)
+ if(EXECUTORCH_BUILD_DEVTOOLS)
+ list(APPEND xnn_executor_runner_libs etdump)
+ else()
+ message(SEND_ERROR "Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled.")
+ endif()
+ endif()
+
target_link_libraries(
- xnn_executor_runner xnnpack_backend gflags portable_ops_lib
+ xnn_executor_runner gflags portable_ops_lib ${xnn_executor_runner_libs}
)
target_compile_options(xnn_executor_runner PUBLIC ${_common_compile_options})
endif()
diff --git a/backends/xnnpack/test/ops/test_add.py b/backends/xnnpack/test/ops/test_add.py
index 784a9d3bbf..29a87df130 100644
--- a/backends/xnnpack/test/ops/test_add.py
+++ b/backends/xnnpack/test/ops/test_add.py
@@ -7,7 +7,7 @@
import unittest
import torch
-from executorch.backends.xnnpack.test.tester import Tester
+from executorch.backends.xnnpack.test.tester import Quantize, Tester
class TestAdd(unittest.TestCase):
@@ -136,9 +136,12 @@ def test_qs8_add2(self):
def test_qs8_add3(self):
inputs = (torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1))
+ calibration_samples = [
+ (torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)) for _ in range(100)
+ ]
(
Tester(self.Add(), inputs)
- .quantize()
+ .quantize(Quantize(calibration_samples=calibration_samples))
.export()
.check_count({"torch.ops.aten.add.Tensor": 4})
.check(["torch.ops.quantized_decomposed"])
@@ -152,7 +155,7 @@ def test_qs8_add3(self):
)
.to_executorch()
.serialize()
- .run_method_and_compare_outputs()
+ .run_method_and_compare_outputs(num_runs=10, atol=0.02, rtol=0.02)
)
class AddRelu(torch.nn.Module):
diff --git a/backends/xnnpack/test/ops/test_conv1d.py b/backends/xnnpack/test/ops/test_conv1d.py
index 833ad69da6..b4c8c41492 100644
--- a/backends/xnnpack/test/ops/test_conv1d.py
+++ b/backends/xnnpack/test/ops/test_conv1d.py
@@ -13,7 +13,7 @@
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.backends.xnnpack.test.test_xnnpack_utils import randomize_bn
-from executorch.backends.xnnpack.test.tester import RunPasses, Tester
+from executorch.backends.xnnpack.test.tester import Quantize, RunPasses, Tester
from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower
from executorch.exir.passes.constant_prop_pass import constant_prop_pass
@@ -98,9 +98,17 @@ def _test_conv1d(
stage=None,
skip_to_executorch=False,
):
+ calibration_samples = (
+ [tuple(torch.randn_like(inputs[i]) for i in range(len(inputs)))]
+ if quantized
+ else None
+ )
+
tester = (
(
- Tester(module, inputs, dynamic_shape).quantize()
+ Tester(module, inputs, dynamic_shape).quantize(
+ Quantize(calibration_samples=calibration_samples)
+ )
if quantized
else Tester(module, inputs)
)
@@ -114,7 +122,9 @@ def _test_conv1d(
# For some tests we want to skip to_executorch because otherwise it will require the
# quantized operators to be loaded and we don't want to do that in the test.
if not skip_to_executorch:
- tester.to_executorch().serialize().run_method_and_compare_outputs()
+ tester.to_executorch().serialize().run_method_and_compare_outputs(
+ num_runs=10, atol=0.02, rtol=0.02
+ )
def test_fp16_conv1d(self):
inputs = (torch.randn(2, 2, 4).to(torch.float16),)
diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py
index dc885135bb..7954425602 100644
--- a/backends/xnnpack/test/tester/tester.py
+++ b/backends/xnnpack/test/tester/tester.py
@@ -12,7 +12,7 @@
import sys
from abc import ABC, abstractmethod
from collections import Counter, OrderedDict
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Type, Union
import torch
from executorch.backends.xnnpack._passes import XNNPACKPassManager
@@ -146,12 +146,14 @@ def __init__(
quantizer: Optional[Quantizer] = None,
quantization_config: Optional[QuantizationConfig] = None,
calibrate: bool = True,
+ calibration_samples: Optional[Sequence[Any]] = None,
):
self.quantizer = quantizer or XNNPACKQuantizer()
self.quantization_config = (
quantization_config or get_symmetric_quantization_config()
)
self.calibrate = calibrate
+ self.calibration_samples = calibration_samples
self.quantizer.set_global(self.quantization_config)
@@ -168,7 +170,11 @@ def run(
if self.calibrate:
# Calibrate prepared model to provide data to quantization observers.
- prepared(*inputs)
+ if self.calibration_samples is not None:
+ for inp in self.calibration_samples:
+ prepared(*inp)
+ else:
+ prepared(*inputs)
converted = convert_pt2e(prepared)
self.converted_graph = converted
diff --git a/codegen/tools/gen_selected_op_variants.py b/codegen/tools/gen_selected_op_variants.py
index da1c1215e2..95ae47f6f1 100644
--- a/codegen/tools/gen_selected_op_variants.py
+++ b/codegen/tools/gen_selected_op_variants.py
@@ -17,7 +17,7 @@
from torchgen.code_template import CodeTemplate
-ops_and_dtypes_template_str = """((exec_aten::string_view(operator_name).compare("$operator_name") == 0)\n && ($dtype_checks))"""
+ops_and_dtypes_template_str = """((executorch::aten::string_view(operator_name).compare("$operator_name") == 0)\n && ($dtype_checks))"""
ops_and_dtypes_template = CodeTemplate(ops_and_dtypes_template_str)
selected_kernel_dtypes_h_template_str = """#pragma once
@@ -27,7 +27,7 @@
inline constexpr bool should_include_kernel_dtype(
const char *operator_name,
- exec_aten::ScalarType scalar_type
+ executorch::aten::ScalarType scalar_type
) {
return $body;
}
@@ -91,7 +91,8 @@ def write_selected_op_variants(yaml_file_path: str, output_dir: str) -> None:
dtype_set = set([x.split(";")[0] for x in tensor_meta])
dtype_list = sorted([dtype_enum_to_type[x] for x in dtype_set])
conditions = [
- "scalar_type == exec_aten::ScalarType::" + x for x in dtype_list
+ "scalar_type == executorch::aten::ScalarType::" + x
+ for x in dtype_list
]
body_parts.append(
ops_and_dtypes_template.substitute(
diff --git a/codegen/tools/test/test_gen_selected_op_variants.py b/codegen/tools/test/test_gen_selected_op_variants.py
index 755b413cf5..e6f056e130 100644
--- a/codegen/tools/test/test_gen_selected_op_variants.py
+++ b/codegen/tools/test/test_gen_selected_op_variants.py
@@ -71,13 +71,13 @@ def test_generates_correct_header(self) -> None:
inline constexpr bool should_include_kernel_dtype(
const char *operator_name,
- exec_aten::ScalarType scalar_type
+ executorch::aten::ScalarType scalar_type
) {
- return ((exec_aten::string_view(operator_name).compare("add.out") == 0)
- && (scalar_type == exec_aten::ScalarType::Float || scalar_type == exec_aten::ScalarType::Int))
- || ((exec_aten::string_view(operator_name).compare("mul.out") == 0)
- && (scalar_type == exec_aten::ScalarType::Float))
- || ((exec_aten::string_view(operator_name).compare("sub.out") == 0)
+ return ((executorch::aten::string_view(operator_name).compare("add.out") == 0)
+ && (scalar_type == executorch::aten::ScalarType::Float || scalar_type == executorch::aten::ScalarType::Int))
+ || ((executorch::aten::string_view(operator_name).compare("mul.out") == 0)
+ && (scalar_type == executorch::aten::ScalarType::Float))
+ || ((executorch::aten::string_view(operator_name).compare("sub.out") == 0)
&& (true));
}
""",
@@ -124,7 +124,7 @@ def test_generates_correct_header(self) -> None:
inline constexpr bool should_include_kernel_dtype(
const char *operator_name,
- exec_aten::ScalarType scalar_type
+ executorch::aten::ScalarType scalar_type
) {
return true;
}
diff --git a/devtools/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp
index 54f84f6fef..1da42aa95d 100644
--- a/devtools/bundled_program/bundled_program.cpp
+++ b/devtools/bundled_program/bundled_program.cpp
@@ -23,10 +23,10 @@
#include
#include
-using exec_aten::ArrayRef;
-using exec_aten::Half;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
+using executorch::aten::ArrayRef;
+using executorch::aten::Half;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using ::executorch::runtime::Error;
using ::executorch::runtime::EValue;
using ::executorch::runtime::Method;
@@ -67,16 +67,16 @@ TensorImpl impl_like(bundled_program_flatbuffer::Tensor* bundled_tensor) {
ScalarType scalar_type =
static_cast<ScalarType>(bundled_tensor->scalar_type());
ssize_t dim = bundled_tensor->sizes()->size();
- exec_aten::SizesType* sizes = bundled_tensor->mutable_sizes()->data();
+ executorch::aten::SizesType* sizes = bundled_tensor->mutable_sizes()->data();
void* data = bundled_tensor->mutable_data()->data();
- exec_aten::DimOrderType* dim_order =
+ executorch::aten::DimOrderType* dim_order =
bundled_tensor->mutable_dim_order()->data();
// The strides of the created TensorImpl will only actually be used during
// comparison (`tensors_are_close` below). To eliminate the use of a memory
// allocator, here we set the initial strides to null and reconstruct the
// stride array as a temporary variable during comparison.
- exec_aten::StridesType* strides = nullptr;
+ executorch::aten::StridesType* strides = nullptr;
return TensorImpl(scalar_type, dim, sizes, data, dim_order, strides);
}
#endif
@@ -165,7 +165,7 @@ bool tensors_are_close(
// Construct the stride array for the bundled tensor based on its dim order,
// since the strides of bundled_tensor are null in lean mode.
- exec_aten::StridesType strides[kMaxDim] = {0};
+ executorch::aten::StridesType strides[kMaxDim] = {0};
auto status = torch::executor::dim_order_to_stride(
bundled_tensor.sizes().data(),
bundled_tensor.dim_order().data(),
@@ -176,7 +176,7 @@ bool tensors_are_close(
// TODO(T132992348): support comparison between tensors of different strides
ET_CHECK_MSG(
- ArrayRef<exec_aten::StridesType>(strides, bundled_tensor.dim()) ==
+ ArrayRef<executorch::aten::StridesType>(strides, bundled_tensor.dim()) ==
method_output_tensor.strides(),
"The two inputs of `tensors_are_close` function shall have same strides");
#endif
diff --git a/devtools/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp
index c8e55b18d7..a34b5188c5 100644
--- a/devtools/etdump/etdump_flatcc.cpp
+++ b/devtools/etdump/etdump_flatcc.cpp
@@ -19,7 +19,7 @@
#include
-using ::exec_aten::Tensor;
+using ::executorch::aten::Tensor;
using ::executorch::runtime::AllocatorID;
using ::executorch::runtime::ArrayRef;
using ::executorch::runtime::ChainID;
@@ -37,27 +37,27 @@ namespace etdump {
namespace {
executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type(
- exec_aten::ScalarType tensor_scalar_type) {
+ executorch::aten::ScalarType tensor_scalar_type) {
switch (tensor_scalar_type) {
- case exec_aten::ScalarType::Byte:
+ case executorch::aten::ScalarType::Byte:
return executorch_flatbuffer_ScalarType_BYTE;
- case exec_aten::ScalarType::Char:
+ case executorch::aten::ScalarType::Char:
return executorch_flatbuffer_ScalarType_CHAR;
- case exec_aten::ScalarType::Short:
+ case executorch::aten::ScalarType::Short:
return executorch_flatbuffer_ScalarType_SHORT;
- case exec_aten::ScalarType::Float:
+ case executorch::aten::ScalarType::Float:
return executorch_flatbuffer_ScalarType_FLOAT;
- case exec_aten::ScalarType::Int:
+ case executorch::aten::ScalarType::Int:
return executorch_flatbuffer_ScalarType_INT;
- case exec_aten::ScalarType::Long:
+ case executorch::aten::ScalarType::Long:
return executorch_flatbuffer_ScalarType_LONG;
- case exec_aten::ScalarType::Double:
+ case executorch::aten::ScalarType::Double:
return executorch_flatbuffer_ScalarType_DOUBLE;
- case exec_aten::ScalarType::Bool:
+ case executorch::aten::ScalarType::Bool:
return executorch_flatbuffer_ScalarType_BOOL;
- case exec_aten::ScalarType::Bits16:
+ case executorch::aten::ScalarType::Bits16:
return executorch_flatbuffer_ScalarType_BITS16;
- case exec_aten::ScalarType::UInt16:
+ case executorch::aten::ScalarType::UInt16:
return executorch_flatbuffer_ScalarType_UINT16;
default:
ET_CHECK_MSG(
@@ -69,7 +69,7 @@ executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type(
etdump_Tensor_ref_t add_tensor_entry(
flatcc_builder_t* builder_,
- const exec_aten::Tensor& tensor,
+ const executorch::aten::Tensor& tensor,
long offset) {
etdump_Tensor_start(builder_);
@@ -508,7 +508,7 @@ void ETDumpGen::set_debug_buffer(Span<uint8_t> buffer) {
debug_buffer_ = buffer;
}
-size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) {
+size_t ETDumpGen::copy_tensor_to_debug_buffer(executorch::aten::Tensor tensor) {
if (tensor.nbytes() == 0) {
return static_cast<size_t>(-1);
}
@@ -536,7 +536,7 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) {
switch (evalue.tag) {
case Tag::Tensor: {
- exec_aten::Tensor tensor = evalue.toTensor();
+ executorch::aten::Tensor tensor = evalue.toTensor();
long offset = copy_tensor_to_debug_buffer(tensor);
etdump_Tensor_ref_t tensor_ref =
add_tensor_entry(builder_, tensor, offset);
@@ -555,7 +555,8 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) {
}
case Tag::ListTensor: {
- exec_aten::ArrayRef<exec_aten::Tensor> tensors = evalue.toTensorList();
+ executorch::aten::ArrayRef<executorch::aten::Tensor> tensors =
+ evalue.toTensorList();
etdump_Tensor_vec_start(builder_);
for (size_t i = 0; i < tensors.size(); ++i) {
long offset = copy_tensor_to_debug_buffer(tensors[i]);
diff --git a/devtools/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h
index 4a818d18e5..d778106653 100644
--- a/devtools/etdump/etdump_flatcc.h
+++ b/devtools/etdump/etdump_flatcc.h
@@ -106,7 +106,7 @@ class ETDumpGen : public ::executorch::runtime::EventTracer {
virtual void log_intermediate_output_delegate(
const char* name,
::executorch::runtime::DebugHandle delegate_debug_index,
- const exec_aten::Tensor& output) override;
+ const executorch::aten::Tensor& output) override;
/**
* Log an intermediate tensor array output from a delegate.
@@ -114,7 +114,8 @@ class ETDumpGen : public ::executorch::runtime::EventTracer {
virtual void log_intermediate_output_delegate(
const char* name,
::executorch::runtime::DebugHandle delegate_debug_index,
- const ::executorch::runtime::ArrayRef<exec_aten::Tensor> output) override;
+ const ::executorch::runtime::ArrayRef<executorch::aten::Tensor> output)
+ override;
/**
* Log an intermediate int output from a delegate.
@@ -157,7 +158,7 @@ class ETDumpGen : public ::executorch::runtime::EventTracer {
void check_ready_to_add_events();
int64_t create_string_entry(const char* name);
- size_t copy_tensor_to_debug_buffer(exec_aten::Tensor tensor);
+ size_t copy_tensor_to_debug_buffer(executorch::aten::Tensor tensor);
/**
* Templated helper function used to log various types of intermediate output.
diff --git a/devtools/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp
index f45652ab8f..664a5ee1a0 100644
--- a/devtools/etdump/tests/etdump_test.cpp
+++ b/devtools/etdump/tests/etdump_test.cpp
@@ -20,8 +20,8 @@
#include
#include
-using ::exec_aten::ScalarType;
-using ::exec_aten::Tensor;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
using ::executorch::etdump::ETDumpGen;
using ::executorch::etdump::ETDumpResult;
using ::executorch::runtime::AllocatorID;
@@ -205,12 +205,12 @@ TEST_F(ProfilerETDumpTest, DebugEvent) {
TEST_F(ProfilerETDumpTest, DebugEventTensorList) {
for (size_t i = 0; i < 2; i++) {
TensorFactory<ScalarType::Float> tf;
- exec_aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})};
+ executorch::aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})};
EValue evalue_1(storage[0]);
EValue evalue_2(storage[1]);
EValue* values_p[2] = {&evalue_1, &evalue_2};
- BoxedEvalueList<exec_aten::Tensor> a_box(values_p, storage, 2);
+ BoxedEvalueList<executorch::aten::Tensor> a_box(values_p, storage, 2);
EValue evalue(a_box);
evalue.tag = Tag::ListTensor;
diff --git a/devtools/visualization/__init__.py b/devtools/visualization/__init__.py
index 645cc5d537..df1d74c7fa 100644
--- a/devtools/visualization/__init__.py
+++ b/devtools/visualization/__init__.py
@@ -8,4 +8,5 @@
ModelExplorerServer,
SingletonModelExplorerServer,
visualize,
+ visualize_graph,
)
diff --git a/devtools/visualization/visualization_utils.py b/devtools/visualization/visualization_utils.py
index 4d520a6636..d21d11082a 100644
--- a/devtools/visualization/visualization_utils.py
+++ b/devtools/visualization/visualization_utils.py
@@ -6,9 +6,13 @@
import subprocess
import time
+from typing import Any, Callable, Type
from executorch.exir import EdgeProgramManager, ExecutorchProgramManager
+from executorch.exir.program._program import _update_exported_program_graph_module
+from torch._export.verifier import Verifier
from torch.export.exported_program import ExportedProgram
+from torch.fx import GraphModule
try:
from model_explorer import config, consts, visualize_from_config # type: ignore
@@ -27,7 +31,7 @@ class SingletonModelExplorerServer:
server: None | subprocess.Popen = None
num_open: int = 0
- wait_after_start = 2.0
+ wait_after_start = 3.0
def __init__(self, open_in_browser: bool = True, port: int | None = None):
if SingletonModelExplorerServer.server is None:
@@ -124,3 +128,29 @@ def visualize(
no_open_in_browser=no_open_in_browser,
**kwargs,
)
+
+
+def visualize_graph(
+ graph_module: GraphModule,
+ exported_program: ExportedProgram | EdgeProgramManager | ExecutorchProgramManager,
+ reuse_server: bool = True,
+ no_open_in_browser: bool = False,
+ **kwargs,
+):
+ """Overrides the graph_module of the supplied exported_program with 'graph_module' before visualizing.
+ Also disables validating operators to allow visualizing graphs containing custom ops.
+
+ A typical use case is after running passes, which return a graph_module rather than an ExportedProgram.
+ """
+
+ class _any_op(Verifier):
+ dialect = "ANY_OP"
+
+ def allowed_op_types(self) -> tuple[Type[Any], ...]:
+ return (Callable,) # type: ignore
+
+ exported_program = _get_exported_program(exported_program)
+ exported_program = _update_exported_program_graph_module(
+ exported_program, graph_module, override_verifiers=[_any_op]
+ )
+ visualize(exported_program, reuse_server, no_open_in_browser, **kwargs)
diff --git a/devtools/visualization/visualization_utils_test.py b/devtools/visualization/visualization_utils_test.py
index dafefa7dfd..d49c6d2f72 100644
--- a/devtools/visualization/visualization_utils_test.py
+++ b/devtools/visualization/visualization_utils_test.py
@@ -8,6 +8,7 @@
import pytest
import torch
+from executorch.backends.arm._passes.decompose_linear_pass import DecomposeLinearPass
from executorch.backends.xnnpack.test.tester import Tester
from executorch.devtools.visualization import (
@@ -15,8 +16,9 @@
SingletonModelExplorerServer,
visualization_utils,
visualize,
+ visualize_graph,
)
-from executorch.exir import ExportedProgram
+from executorch.exir import ExportedProgram, to_edge_transform_and_lower
try:
from model_explorer.config import ModelExplorerConfig # type: ignore
@@ -145,6 +147,17 @@ def test_visualize_to_executorch(server):
)
+def test_visualize_graph(server):
+ with server():
+ model = Linear(20, 30)
+ exported_program = torch.export.export(model, model.get_inputs())
+ exported_program = to_edge_transform_and_lower(
+ exported_program
+ ).exported_program()
+ modified_gm = DecomposeLinearPass()(exported_program.graph_module).graph_module
+ visualize_graph(modified_gm, exported_program)
+
+
if __name__ == "__main__":
"""A test to run locally to make sure that the web browser opens up
automatically as intended.
@@ -158,3 +171,7 @@ def test_visualize_to_executorch(server):
test_visualize_to_edge(SingletonModelExplorerServer)
test_visualize_partition(SingletonModelExplorerServer)
test_visualize_to_executorch(SingletonModelExplorerServer)
+ test_visualize_graph(SingletonModelExplorerServer)
+
+ # Sleep to give the server time to load the last graph before killing it.
+ time.sleep(3.0)
diff --git a/docs/source/_static/img/et-logo.png b/docs/source/_static/img/et-logo.png
new file mode 100644
index 0000000000..b7995a5db7
Binary files /dev/null and b/docs/source/_static/img/et-logo.png differ
diff --git a/docs/source/_static/img/swiftpm_xcode1.png b/docs/source/_static/img/swiftpm_xcode1.png
index 61859c38fa..11b9c23782 100644
Binary files a/docs/source/_static/img/swiftpm_xcode1.png and b/docs/source/_static/img/swiftpm_xcode1.png differ
diff --git a/docs/source/apple-runtime.md b/docs/source/apple-runtime.md
index fe744add52..4114b78060 100644
--- a/docs/source/apple-runtime.md
+++ b/docs/source/apple-runtime.md
@@ -25,7 +25,7 @@ The prebuilt ExecuTorch runtime, backend, and kernels are available as a [Swift
#### Xcode
-In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the [ExecuTorch repo](https://github.com/pytorch/executorch) into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version in format "swiftpm-<version>", (e.g. "swiftpm-0.4.0"), or a branch name in format "swiftpm-<version>.<year><month><date>" (e.g. "swiftpm-0.4.0-20241201") for a nightly build on a specific date.
+In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the [ExecuTorch repo](https://github.com/pytorch/executorch) into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version in format "swiftpm-<version>", (e.g. "swiftpm-0.5.0"), or a branch name in format "swiftpm-<version>.<year><month><date>" (e.g. "swiftpm-0.5.0-20250130") for a nightly build on a specific date.

@@ -58,7 +58,7 @@ let package = Package(
],
dependencies: [
// Use "swiftpm-." branch name for a nightly build.
- .package(url: "https://github.com/pytorch/executorch.git", branch: "swiftpm-0.4.0")
+ .package(url: "https://github.com/pytorch/executorch.git", branch: "swiftpm-0.5.0")
],
targets: [
.target(
diff --git a/docs/source/getting-started-faqs.md b/docs/source/getting-started-faqs.md
new file mode 100644
index 0000000000..e103309f71
--- /dev/null
+++ b/docs/source/getting-started-faqs.md
@@ -0,0 +1,56 @@
+# FAQs and Common Issues
+
+This page summarizes frequently asked questions and provides guidance on issues that commonly occur when adopting ExecuTorch.
+
+If a specific issue is not covered here, consider searching for or creating an issue on GitHub under [Issues](https://github.com/pytorch/executorch/issues) or [Discussions](https://github.com/pytorch/executorch/discussions).
+
+## Export
+
+### Missing out variants: { _ }
+
+The model likely contains torch custom operators. Custom ops need an ExecuTorch implementation and must be loaded at export time. See the [ExecuTorch Custom Ops Documentation](https://pytorch.org/executorch/main/kernel-library-custom-aten-kernel.html#apis) for details on how to do this.
+
+### RuntimeError: PyTorch convert function for op _ not implemented
+
+The model likely contains an operator that is not yet supported on ExecuTorch. In this case, consider searching for or creating an issue on [GitHub](https://github.com/pytorch/executorch/issues).
+
+## Runtime
+
+ExecuTorch error codes are defined in [executorch/runtime/core/error.h](https://github.com/pytorch/executorch/blob/main/runtime/core/error.h).
+
+### Inference is Slow / Performance Troubleshooting
+
+If building the runtime from source, ensure that the build is done in release mode. For CMake builds, this can be done by passing `-DCMAKE_BUILD_TYPE=Release`.
+
+Ensure the model is delegated. If not targeting a specific accelerator, use the XNNPACK delegate for CPU performance. Undelegated operators will typically fall back to the ExecuTorch portable library, which is designed as a fallback and is not intended for performance-sensitive operators. To target XNNPACK, pass an `XnnpackPartitioner` to `to_edge_transform_and_lower`, as in the sketch below. See [Building and Running ExecuTorch with XNNPACK Backend](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering.html) for more information.
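+
+A minimal export-time sketch of delegating to XNNPACK (the model and example inputs below are placeholders):
+```
+import torch
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# Placeholder model and inputs; substitute your own.
+model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
+example_inputs = (torch.randn(1, 8),)
+
+exported = torch.export.export(model, example_inputs)
+et_program = to_edge_transform_and_lower(
+    exported, partitioner=[XnnpackPartitioner()]
+).to_executorch()
+
+with open("model.pte", "wb") as f:
+    f.write(et_program.buffer)
+```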
+
+Thread count can have a significant impact on CPU performance. The optimal thread count may depend on the model and application. By default, ExecuTorch currently uses as many threads as there are cores. Consider setting the thread count to half the number of cores, or simply to 4 on mobile CPUs.
+
+Thread count can be set with the following function. Ensure this is done prior to loading or running a model.
+```
+::executorch::extension::threadpool::get_threadpool()->_unsafe_reset_threadpool(num_threads);
+```
+
+For a deeper investigation into model performance, ExecuTorch supports operator-level performance profiling. See [Using the ExecuTorch Developer Tools to Profile a Model](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial.html) for more information.
+
+### Missing Logs
+
+ExecuTorch provides hooks to route runtime logs. By default, logs are sent to stdout/stderr, but users can override `et_pal_emit_log_message` to route logs to a custom destination. The Android and iOS extensions also provide out-of-box log routing to the appropriate platform logs. See [Runtime Platform Abstraction Layer (PAL)](https://pytorch.org/executorch/main/runtime-platform-abstraction-layer.html) for more information.
+
+### Error setting input: 0x10 / Attempted to resize a bounded tensor...
+
+This usually means the inputs provided do not match the shape of the example inputs used during model export. If the model is expected to handle varying size inputs (dynamic shapes), make sure the model export specifies the appropriate bounds. See [Expressing Dynamism](https://pytorch.org/docs/stable/export.html#expressing-dynamism) for more information on specifying dynamic shapes.
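+
+A minimal sketch of exporting with a dynamic input dimension (the tiny model, the argument name `x`, and the bound of 2048 are illustrative):
+```
+import torch
+from torch.export import Dim, export
+
+
+class TinyModel(torch.nn.Module):  # stand-in for your own model
+    def forward(self, x):
+        return x * 2
+
+
+# Allow dimension 1 of `x` to vary at runtime, up to an application-specific bound.
+seq_len = Dim("seq_len", max=2048)
+exported = export(TinyModel().eval(), (torch.randn(1, 32),), dynamic_shapes={"x": {1: seq_len}})
+```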
+
+### Error 0x14 (Operator Missing)
+
+This usually means that the selective build configuration is incorrect. Ensure that the operator library is generated from the current version of the model, that the corresponding `et_operator_library` is a dependency of the app-level `executorch_generated_lib`, and that the generated lib is linked into the application.
+
+This can also occur if the ExecuTorch portable library does not yet have an implementation of the given ATen operator. In this case, consider searching for or creating an issue on [GitHub](https://github.com/pytorch/executorch/issues).
+
+### Error 0x20 (Not Found)
+
+This error can occur for a few reasons, but the most common is a missing backend target. Ensure the appropriate backend target is linked. For XNNPACK, this is `xnnpack_backend`. If the backend is linked but is still not available, try linking with `--whole-archive`: `-Wl,--whole-archive libxnnpack_backend.a -Wl,--no-whole-archive`.
+
+### Duplicate Kernel Registration Abort
+
+This manifests as a crash whose call stack includes ExecuTorch kernel registration code and ends in an `et_pal_abort`. This typically means there are multiple `gen_operators_lib` targets linked into the application. There must be only one generated operator library per target, though each model can have its own `gen_selected_ops/generate_bindings_for_kernels` call.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index b3c69dd9e7..ea3cf5d827 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -87,7 +87,7 @@ Topics in this section will help you get started with ExecuTorch.
getting-started-setup
export-overview
runtime-build-and-cross-compilation
-
+ getting-started-faqs
.. toctree::
:glob:
diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md
index de54de7706..6bfbfa6be3 100644
--- a/docs/source/native-delegates-executorch-xnnpack-delegate.md
+++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md
@@ -70,7 +70,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre
When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors.
#### **Profiling**
-We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](./tutorials/devtools-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information.
+Basic profiling for the XNNPACK delegate can be enabled with the compiler flag `-DEXECUTORCH_ENABLE_EVENT_TRACER` (add `-DENABLE_XNNPACK_PROFILING` for additional details). With ExecuTorch's Developer Tools integration, you can also use the Developer Tools to profile the model. Follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](./tutorials/devtools-integration-tutorial) to profile ExecuTorch models and use the Developer Tools' Inspector API to view XNNPACK's internal profiling information. An example implementation is available in the `xnn_executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)).
[comment]: <> (TODO: Refactor quantizer to a more official quantization doc)
diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md
index 1c71a6ba80..d1148511c5 100644
--- a/docs/source/tutorial-xnnpack-delegate-lowering.md
+++ b/docs/source/tutorial-xnnpack-delegate-lowering.md
@@ -177,3 +177,6 @@ Now you should be able to find the executable built at `./cmake-out/backends/xnn
## Building and Linking with the XNNPACK Backend
You can build the XNNPACK backend [CMake target](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/CMakeLists.txt#L83), and link it with your application binary such as an Android or iOS application. For more information on this you may take a look at this [resource](demo-apps-android.md) next.
+
+## Profiling
+To enable profiling in the `xnn_executor_runner`, pass the flags `-DEXECUTORCH_ENABLE_EVENT_TRACER=ON` and `-DEXECUTORCH_BUILD_DEVTOOLS=ON` to the build command (add `-DENABLE_XNNPACK_PROFILING=ON` for additional details). This enables ETDump generation when running inference and exposes command-line flags for profiling (see `xnn_executor_runner --help` for details).
diff --git a/examples/apple/coreml/executor_runner/main.mm b/examples/apple/coreml/executor_runner/main.mm
index 35608dd092..1824458e34 100644
--- a/examples/apple/coreml/executor_runner/main.mm
+++ b/examples/apple/coreml/executor_runner/main.mm
@@ -249,8 +249,8 @@ Args parse_command_line_args(NSArray *args) {
}
Buffer buffer(tensor_meta->nbytes(), 0);
auto sizes = tensor_meta->sizes();
- exec_aten::TensorImpl tensor_impl(tensor_meta->scalar_type(), std::size(sizes), const_cast(sizes.data()), buffer.data());
- exec_aten::Tensor tensor(&tensor_impl);
+ executorch::aten::TensorImpl tensor_impl(tensor_meta->scalar_type(), std::size(sizes), const_cast(sizes.data()), buffer.data());
+ executorch::aten::Tensor tensor(&tensor_impl);
EValue input_value(std::move(tensor));
Error err = method.set_input(input_value, i);
if (err != Error::Ok) {
diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh
index 49729fdbf6..8b4cd275e4 100755
--- a/examples/arm/setup.sh
+++ b/examples/arm/setup.sh
@@ -7,28 +7,15 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-set -eu
-
-if [[ "${1:-'.'}" == "-h" || "${#}" -gt 2 ]]; then
- echo "Usage: $(basename $0) <--i-agree-to-the-contained-eula> [path-to-a-scratch-dir]"
- echo "Supplied args: $*"
- exit 1
-fi
-
-
-########
-### Helper functions
-########
-ARCH="$(uname -m)"
-OS="$(uname -s)"
-
-
+set -u
########
### Hardcoded constants
########
script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
et_dir=$(realpath $script_dir/../..)
+ARCH="$(uname -m)"
+OS="$(uname -s)"
if [[ "${ARCH}" == "x86_64" ]]; then
# FVPs
@@ -78,39 +65,40 @@ tosa_reference_model_rev="v0.80.1"
# vela
vela_repo_url="https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela"
-vela_rev="fc970e3da72e5f6930b840b357684126602b3126"
+vela_rev="e131bf4f528f0d461868229972e07f371dcbc881"
-########
-### Mandatory user args
-########
-eula_acceptance="${1:-'.'}"
-if [[ "${eula_acceptance}" != "--i-agree-to-the-contained-eula" ]]; then
- if [[ ${ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA} != "True" ]]; then
- echo "Must pass first positional argument '--i-agree-to-the-contained-eula' to agree to EULA associated with downloading the FVP. Exiting!"
- exit 1
- else
- echo "Arm EULA for FVP agreed to with ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA=True environment variable"
- fi
-else
- shift; # drop this arg
-fi
########
### Optional user args
########
-root_dir=${1:-"${script_dir}/ethos-u-scratch"}
+root_dir=${2:-"${script_dir}/ethos-u-scratch"}
mkdir -p ${root_dir}
root_dir=$(realpath ${root_dir})
+setup_path_script="${root_dir}/setup_path.sh"
+
########
### Functions
########
function setup_fvp() {
+
+ # Mandatory user arg --i-agree-to-the-contained-eula
+ eula_acceptance="${1:-'.'}"
+ if [[ "${eula_acceptance}" != "--i-agree-to-the-contained-eula" ]]; then
+ if [[ ${ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA} != "True" ]]; then
+ echo "Must pass first positional argument '--i-agree-to-the-contained-eula' to agree to EULA associated with downloading the FVP. Exiting!"
+ exit 1
+ else
+ echo "Arm EULA for FVP agreed to with ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA=True environment variable"
+ fi
+ else
+ shift; # drop this arg
+ fi
if [[ "${OS}" != "Linux" ]]; then
echo "[${FUNCNAME[0]}] Warning: FVP only supported with Linux OS, skipping FVP setup..."
echo "[${FUNCNAME[0]}] Warning: For MacOS, using https://github.com/Arm-Examples/FVPs-on-Mac is recommended."
- echo "[${FUNCNAME[0]}] Warning: Follow the instructions and make sure the path is set correctly."
+ echo "[${FUNCNAME[0]}] Warning: Follow the instructions and make sure the path is set correctly."
return 1
fi
@@ -148,17 +136,7 @@ function setup_fvp() {
exit 1
;;
esac
-
- model_dir_variable=${fvp}_model_dir
- fvp_model_dir=${!model_dir_variable}
- fvp_bin_path="$(cd models/${fvp_model_dir} && pwd)"
- export PATH=${PATH}:${fvp_bin_path}
-
- echo "export PATH=\${PATH}:${fvp_bin_path}" >> ${setup_path_script}
done
-
- # Fixup for Corstone-320 python dependency
- echo "export LD_LIBRARY_PATH=${root_dir}/FVP-corstone320/python/lib/" >> ${setup_path_script}
}
function setup_toolchain() {
@@ -173,10 +151,6 @@ function setup_toolchain() {
echo "[${FUNCNAME[0]}] Installing toolchain ..."
rm -rf "${toolchain_dir}"
tar xf "${toolchain_dir}.tar.xz"
- toolchain_bin_path="$(cd ${toolchain_dir}/bin && pwd)"
- export PATH=${PATH}:${toolchain_bin_path}
- hash arm-none-eabi-gcc
- echo "export PATH=\${PATH}:${toolchain_bin_path}" >> ${setup_path_script}
}
function setup_tosa_reference_model() {
@@ -188,48 +162,81 @@ function setup_tosa_reference_model() {
}
function setup_vela() {
- #
- # Prepare the Vela compiler for AoT to Ethos-U compilation
- #
pip install ethos-u-vela@git+${vela_repo_url}@${vela_rev}
}
+function setup_path() {
+ echo $setup_path_script
+}
+
+function create_setup_path(){
+ echo "" > "${setup_path_script}"
+ fvps=("corstone300" "corstone320")
+ for fvp in "${fvps[@]}"; do
+ model_dir_variable=${fvp}_model_dir
+ fvp_model_dir=${!model_dir_variable}
+ fvp_bin_path="${root_dir}/FVP-${fvp}/models/${fvp_model_dir}"
+ echo "export PATH=\${PATH}:${fvp_bin_path}" >> ${setup_path_script}
+ done
+
+ # Fixup for Corstone-320 python dependency
+ echo "export LD_LIBRARY_PATH=${root_dir}/FVP-corstone320/python/lib/" >> ${setup_path_script}
+
+ toolchain_bin_path="$(cd ${toolchain_dir}/bin && pwd)"
+ echo "export PATH=\${PATH}:${toolchain_bin_path}" >> ${setup_path_script}
+
+ echo "hash FVP_Corstone_SSE-300_Ethos-U55" >> ${setup_path_script}
+ echo "hash FVP_Corstone_SSE-300_Ethos-U65" >> ${setup_path_script}
+ echo "hash FVP_Corstone_SSE-320" >> ${setup_path_script}
+}
+
########
### main
########
-# do basic checks
-# Make sure we are on a supported platform
-if [[ "${ARCH}" != "x86_64" ]] && [[ "${ARCH}" != "aarch64" ]] \
- && [[ "${ARCH}" != "arm64" ]]; then
- echo "[main] Error: only x86-64 & aarch64 architecture is supported for now!"
- exit 1
-fi
+# Only run this if script is executed, not if it is sourced
+(return 0 2>/dev/null) && is_script_sourced=1 || is_script_sourced=0
+if [[ $is_script_sourced -eq 0 ]]
+ then
+ set -e
+ if [[ "${ARCH}" != "x86_64" ]] && [[ "${ARCH}" != "aarch64" ]] \
+ && [[ "${ARCH}" != "arm64" ]]; then
+ echo "[main] Error: only x86-64 & aarch64 architecture is supported for now!"
+ exit 1
+ fi
-cd "${script_dir}"
+ # Make sure we are on a supported platform
+ if [[ "${1:-'.'}" == "-h" || "${#}" -gt 2 ]]; then
+ echo "Usage: $(basename $0) <--i-agree-to-the-contained-eula> [path-to-a-scratch-dir]"
+ echo "Supplied args: $*"
+ exit 1
+ fi
-# Setup the root dir
-cd "${root_dir}"
-echo "[main] Using root dir ${root_dir}"
+ cd "${script_dir}"
-setup_path_script="${root_dir}/setup_path.sh"
-echo "" > "${setup_path_script}"
+ # Setup the root dir
+ cd "${root_dir}"
+ echo "[main] Using root dir ${root_dir}"
+
+ # Import utils
+ source $et_dir/backends/arm/scripts/utils.sh
-# Import utils
-source $et_dir/backends/arm/scripts/utils.sh
+ # Setup FVP
+ setup_fvp ${1:-'.'}
-# Setup toolchain
-setup_toolchain
+ # Setup toolchain
+ setup_toolchain
-# Setup the tosa_reference_model
-setup_tosa_reference_model
+ # Create new setup_path script only if fvp and toolchain setup went well.
+ create_setup_path
-# Setup vela and patch in codegen fixes
-setup_vela
+ # Setup the tosa_reference_model
+ setup_tosa_reference_model
-# Setup FVP
-setup_fvp
+ # Setup vela and patch in codegen fixes
+ setup_vela
-echo "[main] update path by doing 'source ${setup_path_script}'"
+ echo "[main] update path by doing 'source ${setup_path_script}'"
-echo "[main] success!"
-exit 0
+ echo "[main] success!"
+ exit 0
+fi
diff --git a/examples/cadence/operators/facto_util.py b/examples/cadence/operators/facto_util.py
index 304b1c7e72..5e6a58ce9f 100644
--- a/examples/cadence/operators/facto_util.py
+++ b/examples/cadence/operators/facto_util.py
@@ -22,7 +22,16 @@ def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> N
tensor_constraints.extend(
[
cp.Dtype.In(lambda deps: [torch.float]),
- cp.Rank.Le(lambda deps: 2**3),
+ cp.Rank.Le(lambda deps: 2**2),
+ cp.Value.Ge(lambda deps, dtype, struct: -2),
+ cp.Value.Le(lambda deps, dtype, struct: 2),
+ ]
+ )
+ case "mean.dim":
+ tensor_constraints.extend(
+ [
+ cp.Dtype.In(lambda deps: [torch.float]),
+ cp.Rank.Le(lambda deps: 2**2),
]
)
case "exp.default":
@@ -86,8 +95,27 @@ def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, s
cp.Value.Le(lambda deps, dtype: 2),
]
)
+ elif in_spec.type.is_scalar_type():
+ spec.inspec[index].constraints.extend(
+ [
+ cp.Dtype.In(lambda deps: apply_scalar_contraints(op_name)),
+ ]
+ )
elif in_spec.type.is_tensor():
spec.inspec[index].constraints.extend(tensor_constraints)
+ elif in_spec.type.is_dim_list():
+ spec.inspec[index].constraints.extend(
+ [
+ cp.Length.Ge(lambda deps: 1),
+ cp.Optional.Eq(lambda deps: False),
+ ]
+ )
+ elif in_spec.type.is_bool():
+ spec.inspec[index].constraints.extend(
+ [
+ cp.Dtype.In(lambda deps: [torch.bool]),
+ ]
+ )
return [
(posargs, inkwargs)
diff --git a/examples/cadence/operators/test_g3_ops.py b/examples/cadence/operators/test_g3_ops.py
index 158e13d389..58433cc739 100644
--- a/examples/cadence/operators/test_g3_ops.py
+++ b/examples/cadence/operators/test_g3_ops.py
@@ -259,6 +259,35 @@ def test_g3__softmax_out(
self.run_and_verify(model, (inputs,))
+ # pyre-ignore[16]: Module `parameterized.parameterized` has no attribute `expand`.
+ @parameterized.expand([*facto_util.facto_testcase_gen("mean.dim")])
+ def test_g3_mean_dim_out(
+ self,
+ posargs: List[int],
+ inkwargs: OrderedDict[str, str],
+ ) -> None:
+ class Meandim(nn.Module):
+ def forward(
+ self,
+ x: torch.Tensor,
+ dim_list: Tuple[int],
+ keepdim: bool,
+ dtype: torch.dtype = torch.float32,
+ ) -> torch.Tensor:
+ return torch.ops.aten.mean.dim(
+ x,
+ dim_list,
+ keepdim,
+ dtype=dtype,
+ )
+
+ model = Meandim()
+
+ self.run_and_verify(
+ model,
+ inputs=tuple(posargs),
+ )
+
if __name__ == "__main__":
unittest.main()
diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj
index aff4c7a74b..f08d61396d 100644
--- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj
+++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj
@@ -806,7 +806,7 @@
isa = XCRemoteSwiftPackageReference;
repositoryURL = "https://github.com/pytorch/executorch";
requirement = {
- branch = "swiftpm-0.4.0.20241120";
+ branch = "swiftpm-0.5.0.20250130";
kind = branch;
};
};
diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
index 0145d7745f..2cc9380879 100644
--- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
+++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
@@ -808,7 +808,7 @@
isa = XCRemoteSwiftPackageReference;
repositoryURL = "https://github.com/pytorch/executorch";
requirement = {
- branch = "swiftpm-0.4.0.20241120";
+ branch = "swiftpm-0.5.0.20250130";
kind = branch;
};
};
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
index bfe66bbd4e..e1a1530acf 100644
--- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
+++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md
@@ -76,7 +76,7 @@ sudo /Applications/CMake.app/Contents/bin/cmake-gui --install
The prebuilt ExecuTorch runtime, backend, and kernels are available as a Swift PM package.
### Xcode
-Open the project in Xcode.In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the ExecuTorch repo into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version, e.g., “0.4.0”, or just use the “latest” branch name for the latest stable build.
+Open the project in Xcode. In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the ExecuTorch repo into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version, e.g., “swiftpm-0.5.0”, or a branch name in the format "swiftpm-<version>.<date>" (e.g., "swiftpm-0.5.0.20250130") for a nightly build on a specific date.
Note: If you're running into any issues related to package dependencies, quit Xcode entirely, delete the whole executorch repo, clean the caches by running the command below in terminal and clone the repo again.
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
index b357628042..784ebe50f8 100644
--- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
+++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md
@@ -130,9 +130,9 @@ While we recommended using the latest prebuilt package pre-configured with the X
Go to Project Navigator, click on LLaMA. `Project --> LLaMA --> Package Dependencies`, and update the package dependencies to any of the available options below:
-- Branch --> swiftpm-0.4.0.20241120 (amend to match the latest nightly build)
-- Branch --> 0.4.0
-- Branch --> 0.3.0
+- Branch --> swiftpm-0.5.0.20250130 (amend to match the latest nightly build)
+- Branch --> swiftpm-0.5.0
+- Branch --> swiftpm-0.4.0
### 2.2 Manually build the package locally and link them
diff --git a/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj b/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj
index 489fa4d9f7..1a58797064 100644
--- a/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj
+++ b/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj
@@ -947,7 +947,7 @@
isa = XCRemoteSwiftPackageReference;
repositoryURL = "https://github.com/pytorch/executorch.git";
requirement = {
- branch = "swiftpm-0.4.0.20241120";
+ branch = "swiftpm-0.5.0.20250130";
kind = branch;
};
};
diff --git a/examples/models/deepseek-r1-distill-llama-8B/README.md b/examples/models/deepseek-r1-distill-llama-8B/README.md
new file mode 100644
index 0000000000..3a7a723c73
--- /dev/null
+++ b/examples/models/deepseek-r1-distill-llama-8B/README.md
@@ -0,0 +1,72 @@
+# Summary
+This example demonstrates how to run the [DeepSeek R1 Distill Llama 8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) model via ExecuTorch. The architecture of this distilled model is exactly the same as Llama, so all the instructions in the [Llama README](../llama/README.md) apply as is.
+
+# Instructions
+## Step 1: Setup
+1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation, run `./install_executorch.sh`.
+
+2. Run the installation step for Llama specific requirements
+```
+./examples/models/llama/install_requirements.sh
+```
+
+## Step 2: Prepare and run the model
+1. Download the model
+```
+pip install -U "huggingface_hub[cli]"
+huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Llama-8B --local-dir /target_dir/DeepSeek-R1-Distill-Llama-8B --local-dir-use-symlinks False
+```
+
+2. Download the [tokenizer.model](https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/original/tokenizer.model) from the Llama 3.1 repo; it will be needed later when running the model with the runtime.
+
+3. Convert the model to a `.pth` file.
+```
+pip install torchtune
+```
+
+Run this Python code:
+```
+from torchtune.models import convert_weights
+from torchtune.training import FullModelHFCheckpointer
+import torch
+
+# Convert from safetensors to the TorchTune format. This assumes the model has been downloaded from Hugging Face.
+checkpointer = FullModelHFCheckpointer(
+ checkpoint_dir='/target_dir/DeepSeek-R1-Distill-Llama-8B',
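+ # The shard names below must match the *.safetensors files in the downloaded checkpoint directory.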
+ checkpoint_files=['model-00001-of-000002.safetensors', 'model-00002-of-000002.safetensors'],
+ output_dir='/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/',
+ model_type='LLAMA3' # or other types that TorchTune supports
+)
+
+print("loading checkpoint")
+sd = checkpointer.load_checkpoint()
+
+# Convert from TorchTune to Meta (PyTorch native)
+sd = convert_weights.tune_to_meta(sd['model'])
+
+print("saving checkpoint")
+torch.save(sd, "/tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth")
+```
+
+4. Download and save the params.json file
+```
+wget https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/original/params.json -O /tmp/params.json
+```
+
+5. Generate a PTE file for use with the Llama runner.
+```
+python -m examples.models.llama.export_llama \
+ --checkpoint /tmp/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/checkpoint.pth \
+ -p /tmp/params.json \
+ -kv \
+ --use_sdpa_with_kv_cache \
+ -X \
+ -qmode 8da4w \
+ --group_size 128 \
+ -d fp16 \
+ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+ --embedding-quantize 4,32 \
+ --output_name="DeepSeek-R1-Distill-Llama-8B.pte"
+```
+
+6. Run the model on your desktop for validation or integrate with iOS/Android apps. Instructions for these are available in the Llama [README](../llama/README.md) starting at Step 3.
diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS
index 4fe7f6cc2b..f6b78e876c 100644
--- a/examples/models/llama/TARGETS
+++ b/examples/models/llama/TARGETS
@@ -14,6 +14,8 @@ runtime.python_library(
srcs = [
"llama_transformer.py",
"rope.py",
+ "attention.py",
+ "model_args.py",
],
_is_external_target = True,
base_module = "executorch.examples.models.llama",
diff --git a/examples/models/llama/attention.py b/examples/models/llama/attention.py
new file mode 100644
index 0000000000..ec55f2f1ee
--- /dev/null
+++ b/examples/models/llama/attention.py
@@ -0,0 +1,255 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional, Tuple, Type, TypedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from executorch.examples.models.llama.model_args import ModelArgs
+from executorch.examples.models.llama.rope import Rope
+
+
+class ForwardOptions(TypedDict, total=False):
+ """Optional parameters for `Attention.forward` (compative with Python 3.10 and plus)."""
+
+ mask: Optional[torch.Tensor]
+ input_pos: Optional[torch.Tensor]
+ in_cache_state: Optional[Any]
+ out_cache_state: Optional[Any]
+
+
+class Attention(nn.Module, ABC):
+ """Abstract base class for attention mechanisms with unified interface."""
+
+ @abstractmethod
+ def forward(
+ self,
+ x: torch.Tensor,
+ freqs_cos: torch.Tensor,
+ freqs_sin: torch.Tensor,
+ **kwargs: ForwardOptions,
+ ) -> Tuple[torch.Tensor, Optional[Any]]:
+ """Forward pass for attention mechanism.
+
+ Args:
+ x: Input tensor of shape (batch_size, seq_len, dim)
+ freqs_cos, freqs_sin: Rotary position embedding frequencies
+ ForwardOptions: grouped optional args
+
+ Returns:
+ Tuple of (output tensor, updated cache state)
+ """
+ pass
+
+
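+# Maps a lowercase attention-type name (ModelArgs.attention_type) to its implementation class;
+# see TransformerBlock in llama_transformer.py for where the lookup happens.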
+ATTENTION_REGISTRY: Dict[str, Type[Attention]] = {}
+
+
+def register_attention(name: str):
+ """Decorator to register attention classes"""
+
+ def decorator(cls: Type[Attention]):
+ ATTENTION_REGISTRY[name.lower()] = cls
+ return cls
+
+ return decorator
+
+
+class KVCache(nn.Module):
+ def __init__(
+ self,
+ max_batch_size: int,
+ max_context_length: int,
+ n_heads: int,
+ head_dim: int,
+ enable_dynamic_shape: bool,
+ dtype=torch.float32,
+ ):
+ super().__init__()
+ self.max_context_length = max_context_length
+ cache_shape = (max_batch_size, n_heads, max_context_length, head_dim)
+
+ self.max_batch_size = max_batch_size
+ self.n_heads = n_heads
+ self.head_dim = head_dim
+ self.enable_dynamic_shape = enable_dynamic_shape
+ self.register_buffer(
+ "k_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
+ )
+ self.register_buffer(
+ "v_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
+ )
+
+ def update(
+ self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ # input_pos: [S], k_val: [B, H, S, D]
+ if self.enable_dynamic_shape:
+ start_pos = input_pos[0].item()
+ torch._check_is_size(start_pos)
+ torch._check(start_pos < self.max_context_length)
+ dim_to_slice = 2
+ seq_length = k_val.size(dim_to_slice)
+ # Replace the entry in the cache for this token
+ # The following lines are equivalent to:
+ # cache_k[:bsz, start_pos : start_pos + seqlen] = xk
+ # cache_v[:bsz, start_pos : start_pos + seqlen] = xv
+ # when dim_to_slice is 1
+ # We use .narrow() here to make the compiler happy
+ # pyre-ignore: Incompatible parameter type [6]
+ narrowed_k = self.k_cache.narrow(dim_to_slice, start_pos, seq_length)
+ # pyre-ignore: Incompatible parameter type [6]
+ narrowed_v = self.v_cache.narrow(dim_to_slice, start_pos, seq_length)
+
+ narrowed_k.copy_(k_val)
+ narrowed_v.copy_(v_val)
+ return self.k_cache, self.v_cache
+ else:
+ k_out = self.k_cache
+ v_out = self.v_cache
+ k_out[:, :, input_pos] = k_val
+ v_out[:, :, input_pos] = v_val
+
+ return k_out, v_out
+
+
+class SDPA(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ head_dim: int,
+ n_rep: int,
+ max_context_len: int,
+ enable_dynamic_shape: bool,
+ ):
+ super().__init__()
+ self.dim = dim
+ self.head_dim = head_dim
+ self.n_rep = n_rep
+ self.max_context_len = max_context_len
+ self.enable_dynamic_shape = enable_dynamic_shape
+
+ def forward(
+ self,
+ input_pos: torch.Tensor,
+ q: torch.Tensor, # Already have rotary embeddings. (bs, n_local_heads, seqlen, head_dim)
+ k: torch.Tensor, # Already have rotary embeddings. (bs, n_local_kv_heads, seqlen, head_dim)
+ v: torch.Tensor, # (bs, n_local_kv_heads, seqlen, head_dim)
+ bsz,
+ seqlen,
+ mask: torch.Tensor,
+ ) -> torch.Tensor:
+ if self.enable_dynamic_shape:
+ start_pos = input_pos[-1].item()
+ torch._check_is_size(start_pos)
+ torch._check(start_pos < self.max_context_len)
+ seq_length = q.size(2)
+ # pyre-ignore: Incompatible parameter type [6]
+ attn_mask = mask.narrow(0, start_pos, seq_length)
+ else:
+ attn_mask = mask[None, None, input_pos]
+
+ # TODO(kimishpatel): This should not be necessary because scaled_dot_product_attention
+ # can natively support GQA now. But needs enable_gqa=True
+ k = k.repeat_interleave(self.n_rep, dim=1)
+ v = v.repeat_interleave(self.n_rep, dim=1)
+ y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0)
+
+ return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
+
+
+@register_attention("mha")
+class AttentionMHA(Attention):
+ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
+ super().__init__()
+ self.use_kv_cache = args.use_kv_cache
+ self.n_heads = args.n_heads
+ self.n_kv_heads = self.n_heads if args.n_kv_heads is None else args.n_kv_heads
+ assert self.n_heads % self.n_kv_heads == 0
+ model_parallel_size = 1
+ self.n_local_heads = self.n_heads // model_parallel_size
+ self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
+ self.n_rep = self.n_local_heads // self.n_local_kv_heads
+ self.head_dim = args.head_dim
+ self.max_batch_size = args.max_batch_size
+ self.max_context_len = args.max_context_len
+ self.dim = args.dim
+ self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False)
+ self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
+ self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
+ self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)
+
+ self.layer_id = layer_id
+
+ self.rope = rope
+
+ causal_mask = torch.tril(
+ torch.ones(
+ self.max_context_len,
+ self.max_context_len,
+ dtype=torch.bool,
+ device="cpu",
+ )
+ )
+ self.register_buffer("mask", causal_mask, persistent=False)
+
+ if self.use_kv_cache:
+ self.kv_cache = KVCache(
+ args.max_batch_size,
+ args.max_context_len,
+ self.n_kv_heads,
+ self.head_dim,
+ args.enable_dynamic_shape,
+ )
+ self.SDPA = SDPA(
+ dim=self.n_local_heads * self.head_dim,
+ head_dim=self.head_dim,
+ n_rep=self.n_rep,
+ max_context_len=self.max_context_len,
+ enable_dynamic_shape=args.enable_dynamic_shape,
+ )
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ freqs_cos: torch.Tensor,
+ freqs_sin: torch.Tensor,
+ **kwargs: ForwardOptions,
+ ) -> Tuple[torch.Tensor, Optional[Any]]:
+ input_pos = kwargs.get("input_pos")
+ bsz, seqlen, _ = x.shape
+
+ # QKV
+ q, k, v = self.wq(x), self.wk(x), self.wv(x)
+ # We need view_copy elimination
+ q = q.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+ k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+ v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+ # RoPE relative positional embeddings
+ q, k = self.rope.forward(q, k, freqs_cos, freqs_sin)
+
+ q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)
+ k = k.transpose(1, 2)
+ v = v.transpose(1, 2)
+
+ if self.use_kv_cache:
+ assert input_pos is not None
+ k, v = self.kv_cache.update(input_pos, k, v)
+ output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask)
+ return self.wo(output)
+
+ # grouped multiquery attention: expand out keys and values
+ k = k.repeat_interleave(self.n_rep, dim=1)
+ v = v.repeat_interleave(self.n_rep, dim=1)
+
+ assert hasattr(self, "mask")
+
+ mask = self.mask[:seqlen, :seqlen]
+
+ output = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+
+ output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
+
+ output = self.wo(output)
+
+ return output
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index c25dce6ffc..618c74e870 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -21,6 +21,8 @@
import pkg_resources
import torch
+
+from executorch.backends.vulkan._passes.remove_asserts import remove_asserts
from executorch.devtools.backend_debug import get_delegation_info
from executorch.devtools.etrecord import generate_etrecord
@@ -335,6 +337,13 @@ def build_args_parser() -> argparse.ArgumentParser:
help="maximum length sequence to evaluate",
)
+ parser.add_argument(
+ "--max_context_length",
+ type=int,
+ default=128,
+ help="maximum length of context for model to remember",
+ )
+
parser.add_argument("-2", "--fairseq2", action="store_true")
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument(
@@ -579,6 +588,7 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
tokenizer_path=args.tokenizer_path,
verbose=args.verbose,
max_seq_len=args.max_seq_length,
+ max_context_len=args.max_context_length,
input_prune_map_path=args.input_prune_map,
output_prune_map_path=args.output_prune_map,
metadata_str=args.metadata,
@@ -637,6 +647,11 @@ def _validate_args(args):
"""
TODO: Combine all the backends under --backend args
"""
+
+ if args.max_context_length < args.max_seq_length:
+ raise ValueError(
+ f"max_context_length {args.max_context_length} must be >= max_seq_len {args.max_seq_length}. max_context_length impacts kv cache size that is used to remember history, while max_seq_length refers to user prompt length. Please use --max_context_length to specify context length."
+ )
if args.enable_dynamic_shape and (args.coreml or args.mps or args.qnn):
raise ValueError(
"Dynamic shape is not supported with coreml, MPS or qnn backends."
@@ -662,6 +677,7 @@ def _validate_args(args):
def _export_llama(args) -> LLMEdgeManager: # noqa: C901
_validate_args(args)
+
pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
# export_to_edge
@@ -713,6 +729,10 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
)
modelname = f"vulkan_{modelname}"
+ # Need to remove asserts from the graph to prevent graph breaks
+ # pyre-ignore: Undefined attribute [16]: `Optional` has no attribute `exported_program`.
+ remove_asserts(builder_exported_to_edge.edge_manager.exported_program())
+
if args.mps:
partitioners.append(get_mps_partitioner(args.use_kv_cache))
modelname = f"mps_{modelname}"
@@ -760,13 +780,13 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
atten = builder_exported_to_edge.model.layers[0].attention
if args.use_qnn_sha:
cache_shape = torch.Size(
- (atten.max_batch_size, atten.max_seq_len, atten.head_dim)
+ (atten.max_batch_size, atten.max_context_len, atten.head_dim)
)
else:
cache_shape = torch.Size(
(
atten.max_batch_size,
- atten.max_seq_len,
+ atten.max_context_len,
atten.n_kv_heads,
atten.head_dim,
)
@@ -861,6 +881,7 @@ def _load_llama_model_metadata(
use_sdpa_with_kv_cache: bool,
enable_dynamic_shape: bool,
max_seq_len: int,
+ max_context_len: int,
n_layers: int,
vocab_size: int,
metadata_str: Optional[str] = None,
@@ -870,6 +891,7 @@ def _load_llama_model_metadata(
"get_bos_id": 3 if is_fairseq2 else 1,
"get_eos_ids": [3] if is_fairseq2 else [2],
"get_max_seq_len": max_seq_len,
+ "get_max_context_len": max_context_len,
"get_n_layers": n_layers,
"get_vocab_size": vocab_size,
"use_kv_cache": use_kv_cache,
@@ -904,6 +926,7 @@ def _load_llama_model(
tokenizer_path: Optional[str] = None,
verbose: bool = False,
max_seq_len: int = 128,
+ max_context_len: int = 128,
input_prune_map_path: Optional[str] = None,
output_prune_map_path: Optional[str] = None,
metadata_str: Optional[str] = None,
@@ -948,6 +971,7 @@ def _load_llama_model(
generate_full_logits=generate_full_logits,
fairseq2=weight_type == WeightType.FAIRSEQ2,
max_seq_len=max_seq_len,
+ max_context_len=max_context_len,
enable_dynamic_shape=enable_dynamic_shape,
input_prune_map_path=input_prune_map_path,
output_prune_map_path=output_prune_map_path,
@@ -1006,10 +1030,13 @@ def _load_llama_model(
# pyre-fixme[6]: For 5th argument expected `ModelArgs` but got
# `Union[Tensor, Module]`.
model.max_seq_len,
- # pyre-fixme[6]: For 6th argument expected `int` but got `Union[Tensor,
+ # pyre-fixme[6]: For 6th argument expected `ModelArgs` but got
+ # `Union[Tensor, Module]`.
+ model.max_context_len,
+ # pyre-fixme[6]: For 7th argument expected `int` but got `Union[Tensor,
# Module]`.
model.n_layers,
- # pyre-fixme[6]: For 7th argument expected `int` but got `Union[Tensor,
+ # pyre-fixme[6]: For 8th argument expected `int` but got `Union[Tensor,
# Module]`.
model.vocab_size,
metadata_str,
diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py
index d5661ae400..08526dde19 100644
--- a/examples/models/llama/llama_transformer.py
+++ b/examples/models/llama/llama_transformer.py
@@ -7,19 +7,16 @@
# Please refer to README.md in the same folder for more information.
-from dataclasses import dataclass
-from functools import partial
-from typing import Dict, Optional, Tuple
+from typing import Optional
import torch
import torch.nn.functional as F
-from executorch.examples.models.llama.rope import (
- hf_apply_rotary_emb,
- hf_precompute_freqs_cis,
- precompute_freqs_cis,
- RotaryEmbedding,
-)
+from executorch.examples.models.llama.attention import ATTENTION_REGISTRY
+
+from executorch.examples.models.llama.model_args import ModelArgs
+
+from executorch.examples.models.llama.rope import Rope
from torch import nn
@@ -71,359 +68,6 @@ def forward(self, x):
return output * self.weight
-def find_multiple(n: int, k: int) -> int:
- if n % k == 0:
- return n
- return n + k - (n % k)
-
-
-@dataclass
-class ModelArgs:
- dim: int = 4096
- n_layers: int = 32
- n_heads: int = 32
- n_kv_heads: Optional[int] = None
- vocab_size: int = -1 # defined later by tokenizer
- hidden_dim: Optional[int] = None
- head_dim: Optional[int] = None # Optional customized head_dim
- multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
- ffn_dim_multiplier: Optional[float] = None
- norm_eps: float = 1e-5
- max_batch_size: int = 32
- max_seq_len: int = 2048
- moe: bool = False # True to enable the MoE (Mixture of Experts)
- num_experts: int = 8 # Number of experts
- num_activated_experts: int = 2 # Number of experts to activate
- use_kv_cache: bool = False # Use key/value cache
- use_sdpa_with_kv_cache_op: bool = (
- False # Use custom sdpa op that updates kv cache in-place
- )
- # Generate logits for all inputs. When it's True, it would take big memory usage
- # at runtime. Enable it only necessary (e.g., use perplexity tools that requires
- # logits for all input tokens.)
- generate_full_logits: bool = False
- enable_dynamic_shape: bool = False # export model with dynamic shape support
- # A dictionary mapping from pruned token-id to original token-id
- input_prune_map: Optional[Dict[int, int]] = None
- # A dictionary mapping from pruned token-id to original token-id
- output_prune_map: Optional[Dict[int, int]] = None
- use_hf_rope: bool = False # Use HuggingFace's RoPE implementation
- rope_theta: Optional[float] = (
- None # The official name to override self.rope_freq_base.
- )
- rope_freq_base: float = 10000.0 # The base frequency for RoPE. Keep it for BC.
- use_scaled_rope: bool = False # Use scaled RoPE, introduced in llama3.1.
- rope_scale_factor: int = 8
- # Additional Model Metadata needed at runtime
- bos_idx: int = 1
- eos_idx: int = 3
- bos_count: int = -1 # i.e., a single EOS is used as BOS
- eos_count: int = 2
-
- quantization_args: Optional[dict] = None
- lora_args: Optional[dict] = None
-
- def __post_init__(self):
- if self.n_kv_heads is None:
- self.n_kv_heads = self.n_heads
-
- # rope_theta overrides rope_freq_base since it's the official name.
- if self.rope_theta is not None:
- self.rope_freq_base = self.rope_theta
-
- if self.use_sdpa_with_kv_cache_op:
- assert self.use_kv_cache, "use_sdpa_with_kv_cache_op requires use_kv_cache"
-
- if self.hidden_dim is None:
- # If hidden_dim is not explicitly set in the ModelArgs,
- # then calculate implicitly based on dim and also multiple of `args.multiple_of`
- multiple_of = self.multiple_of
- hidden_dim = 4 * self.dim
- hidden_dim = int(2 * hidden_dim / 3)
- if self.ffn_dim_multiplier is not None:
- hidden_dim = int(self.ffn_dim_multiplier * hidden_dim)
- self.hidden_dim = find_multiple(hidden_dim, multiple_of)
-
- if self.head_dim is None:
- self.head_dim = self.dim // self.n_heads
-
-
-class Rope(torch.nn.Module):
- def __init__(self, params: ModelArgs):
- super().__init__()
- self.params = params
- if self.params.use_hf_rope:
- self.precompute_freqs_cis = hf_precompute_freqs_cis
- else:
- self.precompute_freqs_cis = partial(
- precompute_freqs_cis,
- use_scaled=self.params.use_scaled_rope,
- scale_factor=self.params.rope_scale_factor,
- )
- freqs_cos, freqs_sin = self.precompute_freqs_cis(
- self.params.head_dim,
- (
- self.params.max_seq_len # Normal llama2.
- if self.params.ffn_dim_multiplier is None
- else self.params.max_seq_len * 2 # Sharded checkpoint.
- ),
- self.params.rope_freq_base,
- )
- self.register_buffer("freqs_cos", freqs_cos, persistent=False)
- self.register_buffer("freqs_sin", freqs_sin, persistent=False)
- if self.params.use_hf_rope:
- self.apply_rotary_emb = hf_apply_rotary_emb
- else:
- self.apply_rotary_emb = RotaryEmbedding()
-
- def forward(
- self,
- q: torch.Tensor,
- k: torch.Tensor,
- freqs_cos: torch.Tensor,
- freqs_sin: torch.Tensor,
- ):
- return self.apply_rotary_emb(q, k, freqs_cos, freqs_sin)
-
- def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int):
- """
- Get the precomputed frequencies for the given input position and sequence length.
-
- Args:
- input_pos (torch.Tensor): The input position tensor.
- seq_len (int): The sequence length.
-
- Returns:
- Tuple[torch.Tensor, torch.Tensor]: The precomputed frequencies for the given input position and sequence length.
- """
- if self.params.use_kv_cache:
- assert (
- input_pos is not None
- ), "input_pos must be provided when use_kv_cache is True"
-
- if self.params.enable_dynamic_shape:
- # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos.
- input_pos_item = input_pos[-1].item()
- torch._check_is_size(input_pos_item)
- torch._check(input_pos_item < self.params.max_seq_len)
- # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor
- freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seq_len)
- # pyre-ignore: Incompatible parameter type [6]
- freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seq_len)
- else:
- # When not using dynamic shape, use of the .item results in
- # symints, due to querying the data from tensor.
- # this path avoids that for mps backend, although probably mps backend
- # can support dynamic shape?
- freqs_cos = self.freqs_cos[input_pos]
- freqs_sin = self.freqs_sin[input_pos]
-
- else:
- assert input_pos is None, "input_pos is unused when use_kv_cache is False"
- freqs_cos = self.freqs_cos[:seq_len]
- freqs_sin = self.freqs_sin[:seq_len]
- return freqs_cos, freqs_sin
-
-
-class KVCache(nn.Module):
- def __init__(
- self,
- max_batch_size: int,
- max_seq_length: int,
- n_heads: int,
- head_dim: int,
- enable_dynamic_shape: bool,
- dtype=torch.float32,
- ):
- super().__init__()
- self.max_seq_length = max_seq_length
- cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)
-
- self.max_batch_size = max_batch_size
- self.n_heads = n_heads
- self.head_dim = head_dim
- self.enable_dynamic_shape = enable_dynamic_shape
- self.register_buffer(
- "k_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
- )
- self.register_buffer(
- "v_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
- )
-
- def update(
- self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor
- ) -> Tuple[torch.Tensor, torch.Tensor]:
- # input_pos: [S], k_val: [B, H, S, D]
- if self.enable_dynamic_shape:
- start_pos = input_pos[0].item()
- torch._check_is_size(start_pos)
- torch._check(start_pos < self.max_seq_length)
- dim_to_slice = 2
- seq_length = k_val.size(dim_to_slice)
- # Replace the entry in the cache for this token
- # The following lines are equivalent to:
- # cache_k[:bsz, start_pos : start_pos + seqlen] = xk
- # cache_v[:bsz, start_pos : start_pos + seqlen] = xv
- # when dim_to_slice is 1
- # We use .narrow() here to make the compiler happy
- # pyre-ignore: Incompatible parameter type [6]
- narrowed_k = self.k_cache.narrow(dim_to_slice, start_pos, seq_length)
- # pyre-ignore: Incompatible parameter type [6]
- narrowed_v = self.v_cache.narrow(dim_to_slice, start_pos, seq_length)
-
- narrowed_k.copy_(k_val)
- narrowed_v.copy_(v_val)
- return self.k_cache, self.v_cache
- else:
- k_out = self.k_cache
- v_out = self.v_cache
- k_out[:, :, input_pos] = k_val
- v_out[:, :, input_pos] = v_val
-
- return k_out, v_out
-
-
-class SDPA(nn.Module):
- def __init__(
- self,
- dim: int,
- head_dim: int,
- n_rep: int,
- max_seq_len: int,
- enable_dynamic_shape: bool,
- ):
- super().__init__()
- self.dim = dim
- self.head_dim = head_dim
- self.n_rep = n_rep
- self.max_seq_len = max_seq_len
- self.enable_dynamic_shape = enable_dynamic_shape
-
- def forward(
- self,
- input_pos: torch.Tensor,
- q: torch.Tensor, # Already have rotary embeddings. (bs, n_local_heads, seqlen, head_dim)
- k: torch.Tensor, # Already have rotary embeddings. (bs, n_local_kv_heads, seqlen, head_dim)
- v: torch.Tensor, # (bs, n_local_kv_heads, seqlen, head_dim)
- bsz,
- seqlen,
- mask: torch.Tensor,
- ) -> torch.Tensor:
- if self.enable_dynamic_shape:
- start_pos = input_pos[-1].item()
- torch._check_is_size(start_pos)
- torch._check(start_pos < self.max_seq_len)
- seq_length = q.size(2)
- # pyre-ignore: Incompatible parameter type [6]
- attn_mask = mask.narrow(0, start_pos, seq_length)
- else:
- attn_mask = mask[None, None, input_pos]
-
- # TODO(kimishpatel): This should not be necessary because scaled_dot_product_attention
- # can natively support GQA now. But needs enable_gqa=True
- k = k.repeat_interleave(self.n_rep, dim=1)
- v = v.repeat_interleave(self.n_rep, dim=1)
- y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0)
-
- return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
-
-
-class Attention(nn.Module):
- def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
- super().__init__()
- self.use_kv_cache = args.use_kv_cache
- self.n_heads = args.n_heads
- self.n_kv_heads = self.n_heads if args.n_kv_heads is None else args.n_kv_heads
- assert self.n_heads % self.n_kv_heads == 0
- model_parallel_size = 1
- self.n_local_heads = self.n_heads // model_parallel_size
- self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
- self.n_rep = self.n_local_heads // self.n_local_kv_heads
- self.head_dim = args.head_dim
- self.max_batch_size = args.max_batch_size
- self.max_seq_len = args.max_seq_len
- self.dim = args.dim
- self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False)
- self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
- self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
- self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)
-
- self.layer_id = layer_id
-
- self.rope = rope
-
- causal_mask = torch.tril(
- torch.ones(
- self.max_seq_len,
- self.max_seq_len,
- dtype=torch.bool,
- device="cpu",
- )
- )
- self.register_buffer("mask", causal_mask, persistent=False)
-
- if self.use_kv_cache:
- self.kv_cache = KVCache(
- args.max_batch_size,
- args.max_seq_len,
- self.n_kv_heads,
- self.head_dim,
- args.enable_dynamic_shape,
- )
- self.SDPA = SDPA(
- dim=self.n_local_heads * self.head_dim,
- head_dim=self.head_dim,
- n_rep=self.n_rep,
- max_seq_len=self.max_seq_len,
- enable_dynamic_shape=args.enable_dynamic_shape,
- )
-
- def forward(
- self,
- x: torch.Tensor,
- freqs_cos: torch.Tensor,
- freqs_sin: torch.Tensor,
- input_pos: Optional[torch.Tensor] = None,
- ):
- bsz, seqlen, _ = x.shape
-
- # QKV
- q, k, v = self.wq(x), self.wk(x), self.wv(x)
- # We need view_copy elimination
- q = q.view(bsz, seqlen, self.n_local_heads, self.head_dim)
- k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
- v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
-
- # RoPE relative positional embeddings
- q, k = self.rope.forward(q, k, freqs_cos, freqs_sin)
-
- q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)
- k = k.transpose(1, 2)
- v = v.transpose(1, 2)
-
- if self.use_kv_cache:
- assert input_pos is not None
- k, v = self.kv_cache.update(input_pos, k, v)
- output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask)
- return self.wo(output)
-
- # grouped multiquery attention: expand out keys and values
- k = k.repeat_interleave(self.n_rep, dim=1)
- v = v.repeat_interleave(self.n_rep, dim=1)
-
- assert hasattr(self, "mask")
-
- mask = self.mask[:seqlen, :seqlen]
-
- output = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
-
- output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
-
- output = self.wo(output)
-
- return output
-
-
class FeedForward(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
@@ -490,7 +134,13 @@ def __init__(self, layer_id: int, args: ModelArgs, rope: Rope):
self.n_heads = args.n_heads
self.dim = args.dim
self.head_dim = args.head_dim
- self.attention = Attention(args, layer_id, rope)
+ if args.attention_type not in ATTENTION_REGISTRY:
+ raise ValueError(
+ f"Unknown attention type: {args.attention_type}. "
+ f"Available: {list(ATTENTION_REGISTRY.keys())}"
+ )
+ cls = ATTENTION_REGISTRY[args.attention_type]
+ self.attention = cls(args, layer_id, rope)
if args.moe:
self.block_sparse_moe = MOEFeedForward(args)
else:
@@ -500,7 +150,7 @@ def __init__(self, layer_id: int, args: ModelArgs, rope: Rope):
def forward(self, x, freqs_cos, freqs_sin, input_pos=None): # x: 1xN
h = self.attention.forward(
- self.attention_norm(x), freqs_cos, freqs_sin, input_pos
+ self.attention_norm(x), freqs_cos, freqs_sin, input_pos=input_pos
)
h = x + h
@@ -528,6 +178,7 @@ def __init__(self, params: ModelArgs):
self.use_kv_cache = params.use_kv_cache
self.generate_full_logits = params.generate_full_logits
self.max_seq_len = params.max_seq_len
+ self.max_context_len = params.max_context_len
self.input_prune_map = params.input_prune_map
self.output_prune_map = params.output_prune_map
diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py
index 9f7994916a..19c7ed0b31 100644
--- a/examples/models/llama/model.py
+++ b/examples/models/llama/model.py
@@ -15,8 +15,9 @@
get_checkpoint_dtype,
get_default_model_resource_dir,
)
+from executorch.examples.models.llama.llama_transformer import Transformer
-from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer
+from executorch.examples.models.llama.model_args import ModelArgs
try:
from .fairseq2 import convert_to_llama_checkpoint
@@ -52,8 +53,13 @@ def __init__(self, **kwargs):
self.input_prune_map_path = kwargs.get("input_prune_map_path", None)
self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
self.max_seq_len = kwargs.get("max_seq_len", 128)
+ self.max_context_len = kwargs.get("max_context_len", 128)
self.args = kwargs.get("args", None)
+ assert (
+ self.max_context_len >= self.max_seq_len
+ ), f"max_context_len({self.max_context_len}) must be >= max_seq_len({self.max_seq_len})"
+
# The example is using a dummy small model with random weights for demo purpose only.
# Follow the instruction in https://github.com/facebookresearch/llama to download the model.
device = "cpu"
@@ -136,6 +142,7 @@ def __init__(self, **kwargs):
model_args: ModelArgs = ModelArgs(
max_seq_len=self.max_seq_len,
+ max_context_len=self.max_context_len,
max_batch_size=1,
use_kv_cache=self.use_kv_cache,
use_sdpa_with_kv_cache_op=self.use_sdpa_with_kv_cache_op,
@@ -219,7 +226,7 @@ def __init__(self, **kwargs):
window_size = int(attention_sink_params[1])
eviction_batch_size = int(attention_sink_params[2])
- assert self.args.max_seq_length == sink_size + window_size
+ assert self.args.max_context_length == sink_size + window_size
self.model_ = enable_attention_sink(
module=self.model_,
diff --git a/examples/models/llama/model_args.py b/examples/models/llama/model_args.py
new file mode 100644
index 0000000000..e1c4edb8e9
--- /dev/null
+++ b/examples/models/llama/model_args.py
@@ -0,0 +1,81 @@
+from dataclasses import dataclass
+from typing import Dict, Optional
+
+
+@dataclass
+class ModelArgs:
+ dim: int = 4096
+ n_layers: int = 32
+ n_heads: int = 32
+ n_kv_heads: Optional[int] = None
+ vocab_size: int = -1 # defined later by tokenizer
+ hidden_dim: Optional[int] = None
+ head_dim: Optional[int] = None # Optional customized head_dim
+ multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
+ ffn_dim_multiplier: Optional[float] = None
+ norm_eps: float = 1e-5
+ max_batch_size: int = 32
+ max_seq_len: int = 2048
+ max_context_len: int = 2048
+ moe: bool = False # True to enable the MoE (Mixture of Experts)
+ num_experts: int = 8 # Number of experts
+ num_activated_experts: int = 2 # Number of experts to activate
+ attention_type: str = "mha" # Attention type, registered in attention.py
+ use_kv_cache: bool = False # Use key/value cache
+ use_sdpa_with_kv_cache_op: bool = (
+ False # Use custom sdpa op that updates kv cache in-place
+ )
+ # Generate logits for all inputs. When it's True, it would take big memory usage
+ # at runtime. Enable it only necessary (e.g., use perplexity tools that requires
+ # logits for all input tokens.)
+ generate_full_logits: bool = False
+ enable_dynamic_shape: bool = False # export model with dynamic shape support
+ # A dictionary mapping from pruned token-id to original token-id
+ input_prune_map: Optional[Dict[int, int]] = None
+ # A dictionary mapping from pruned token-id to original token-id
+ output_prune_map: Optional[Dict[int, int]] = None
+ use_hf_rope: bool = False # Use HuggingFace's RoPE implementation
+ rope_theta: Optional[float] = (
+ None # The official name to override self.rope_freq_base.
+ )
+ rope_freq_base: float = 10000.0 # The base frequency for RoPE. Keep it for BC.
+ use_scaled_rope: bool = False # Use scaled RoPE, introduced in llama3.1.
+ rope_scale_factor: int = 8
+ # Additional Model Metadata needed at runtime
+ bos_idx: int = 1
+ eos_idx: int = 3
+ bos_count: int = -1 # i.e., a single EOS is used as BOS
+ eos_count: int = 2
+
+ quantization_args: Optional[dict] = None
+ lora_args: Optional[dict] = None
+
+ def __post_init__(self):
+ if self.n_kv_heads is None:
+ self.n_kv_heads = self.n_heads
+
+ # rope_theta overrides rope_freq_base since it's the official name.
+ if self.rope_theta is not None:
+ self.rope_freq_base = self.rope_theta
+
+ if self.use_sdpa_with_kv_cache_op:
+ assert self.use_kv_cache, "use_sdpa_with_kv_cache_op requires use_kv_cache"
+
+ if self.hidden_dim is None:
+ # If hidden_dim is not explicitly set in the ModelArgs,
+ # then calculate implicitly based on dim and also multiple of `args.multiple_of`
+ multiple_of = self.multiple_of
+ hidden_dim = 4 * self.dim
+ hidden_dim = int(2 * hidden_dim / 3)
+ if self.ffn_dim_multiplier is not None:
+ hidden_dim = int(self.ffn_dim_multiplier * hidden_dim)
+
+ def find_multiple(n: int, k: int) -> int:
+ if n % k == 0:
+ return n
+ return n + k - (n % k)
+
+ self.hidden_dim = find_multiple(hidden_dim, multiple_of)
+
+ if self.head_dim is None:
+ self.head_dim = self.dim // self.n_heads
diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py
index cd3ddb0d3b..01352f404d 100644
--- a/examples/models/llama/rope.py
+++ b/examples/models/llama/rope.py
@@ -8,9 +8,11 @@
# Different RoPE implementations
import math
+from functools import partial
from typing import Optional, Tuple
import torch
+from executorch.examples.models.llama.model_args import ModelArgs
# ======================== Stock Implementation ========================
@@ -205,3 +207,80 @@ def hf_apply_rotary_emb_to_k(k, cos, sin, position_ids=None, unsqueeze_dim=1):
sin = sin.unsqueeze(unsqueeze_dim)
k_embed = (k * cos) + (rotate_half(k) * sin)
return k_embed
+
+
+class Rope(torch.nn.Module):
+ def __init__(self, params: ModelArgs):
+ super().__init__()
+ self.params = params
+ if self.params.use_hf_rope:
+ self.precompute_freqs_cis = hf_precompute_freqs_cis
+ else:
+ self.precompute_freqs_cis = partial(
+ precompute_freqs_cis,
+ use_scaled=self.params.use_scaled_rope,
+ scale_factor=self.params.rope_scale_factor,
+ )
+ freqs_cos, freqs_sin = self.precompute_freqs_cis(
+ self.params.head_dim,
+ (
+ self.params.max_context_len # Normal llama2.
+ if self.params.ffn_dim_multiplier is None
+ else self.params.max_context_len * 2 # Sharded checkpoint.
+ ),
+ self.params.rope_freq_base,
+ )
+ self.register_buffer("freqs_cos", freqs_cos, persistent=False)
+ self.register_buffer("freqs_sin", freqs_sin, persistent=False)
+ if self.params.use_hf_rope:
+ self.apply_rotary_emb = hf_apply_rotary_emb
+ else:
+ self.apply_rotary_emb = RotaryEmbedding()
+
+ def forward(
+ self,
+ q: torch.Tensor,
+ k: torch.Tensor,
+ freqs_cos: torch.Tensor,
+ freqs_sin: torch.Tensor,
+ ):
+ return self.apply_rotary_emb(q, k, freqs_cos, freqs_sin)
+
+ def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int):
+ """
+ Get the precomputed frequencies for the given input position and sequence length.
+
+ Args:
+ input_pos (torch.Tensor): The input position tensor.
+ seq_len (int): The sequence length.
+
+ Returns:
+ Tuple[torch.Tensor, torch.Tensor]: The precomputed frequencies for the given input position and sequence length.
+ """
+ if self.params.use_kv_cache:
+ assert (
+ input_pos is not None
+ ), "input_pos must be provided when use_kv_cache is True"
+
+ if self.params.enable_dynamic_shape:
+ # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos.
+ input_pos_item = input_pos[-1].item()
+ torch._check_is_size(input_pos_item)
+ torch._check(input_pos_item < self.params.max_context_len)
+ # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor
+ freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seq_len)
+ # pyre-ignore: Incompatible parameter type [6]
+ freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seq_len)
+ else:
+ # When dynamic shape is not enabled, calling .item() would produce symints
+ # because it queries data from the tensor. This indexing path avoids that
+ # for the MPS backend, although MPS could probably support dynamic shape.
+ freqs_cos = self.freqs_cos[input_pos]
+ freqs_sin = self.freqs_sin[input_pos]
+
+ else:
+ assert input_pos is None, "input_pos is unused when use_kv_cache is False"
+ freqs_cos = self.freqs_cos[:seq_len]
+ freqs_sin = self.freqs_sin[:seq_len]
+ return freqs_cos, freqs_sin
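+
+
+# Illustrative usage (hypothetical values, not taken from this change):
+#
+#   params = ModelArgs(dim=2048, n_heads=32, use_kv_cache=True, max_context_len=128)
+#   rope = Rope(params)
+#   # cos/sin tables for a single decode step at position 10 ...
+#   freqs_cos, freqs_sin = rope.get_freqs(torch.tensor([10]), seq_len=1)
+#   # ... applied to query/key tensors q and k.
+#   q_rot, k_rot = rope(q, k, freqs_cos, freqs_sin)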
diff --git a/examples/models/llama/source_transformation/attention.py b/examples/models/llama/source_transformation/attention.py
index 7dc9003f13..d5f065550d 100644
--- a/examples/models/llama/source_transformation/attention.py
+++ b/examples/models/llama/source_transformation/attention.py
@@ -12,7 +12,7 @@
from typing import List, Optional, Tuple
import torch
-from executorch.examples.models.llama.llama_transformer import Attention
+from executorch.examples.models.llama.attention import Attention
from torch import nn
@@ -32,7 +32,7 @@ class KVCacheSHA(torch.nn.Module):
def __init__(
self,
max_batch_size: int,
- max_seq_length: int,
+ max_context_length: int,
n_heads: int,
head_dim: int,
dtype=torch.float32,
@@ -40,7 +40,7 @@ def __init__(
super().__init__()
# a buffer per head
- cache_shape = (max_batch_size, max_seq_length, head_dim)
+ cache_shape = (max_batch_size, max_context_length, head_dim)
for i in range(n_heads):
self.register_buffer(
f"past_k_caches_{i}",
@@ -79,7 +79,7 @@ class SDPASHA(torch.nn.Module):
def __init__(
self,
max_batch_size: int,
- max_seq_length: int,
+ max_context_length: int,
n_heads: int,
n_rep: int,
head_dim: int,
@@ -90,7 +90,7 @@ def __init__(
self.n_rep = n_rep
self.dim = dim
self.kv_cache = KVCacheSHA(
- max_batch_size, max_seq_length, n_heads // n_rep, head_dim
+ max_batch_size, max_context_length, n_heads // n_rep, head_dim
)
self.scale_factor = math.sqrt(head_dim)
@@ -134,11 +134,11 @@ def __init__(self, attention_mha: nn.Module):
self.n_rep = self.n_heads // self.n_kv_heads
self.dim = attention_mha.dim
self.max_batch_size = attention_mha.max_batch_size
- self.max_seq_len = attention_mha.max_seq_len
+ self.max_context_len = attention_mha.max_context_len
self.head_dim = attention_mha.dim // self.n_heads
self.SDPA = SDPASHA(
self.max_batch_size,
- self.max_seq_len,
+ self.max_context_len,
self.n_heads,
self.n_rep,
self.head_dim,
@@ -184,8 +184,8 @@ def __init__(self, attention_mha: nn.Module):
causal_mask = torch.tril(
torch.ones(
- self.max_seq_len,
- self.max_seq_len,
+ self.max_context_len,
+ self.max_context_len,
dtype=torch.bool,
device="cpu",
)
diff --git a/examples/models/llama/source_transformation/attention_sink.py b/examples/models/llama/source_transformation/attention_sink.py
index 5b3bfba9ad..22bd8a3e22 100644
--- a/examples/models/llama/source_transformation/attention_sink.py
+++ b/examples/models/llama/source_transformation/attention_sink.py
@@ -12,15 +12,12 @@
import torch
-from executorch.examples.models.llama.llama_transformer import (
- Attention,
- KVCache,
- ModelArgs,
- Rope,
-)
+from executorch.examples.models.llama.attention import AttentionMHA, KVCache
+from executorch.examples.models.llama.model_args import ModelArgs
from executorch.examples.models.llama.rope import (
apply_rotary_emb_to_k,
hf_apply_rotary_emb_to_k,
+ Rope,
)
from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter
@@ -44,8 +41,8 @@ def __init__(
self.apply_rotary_emb_to_k = hf_apply_rotary_emb_to_k
else:
self.apply_rotary_emb_to_k = apply_rotary_emb_to_k
- self.max_seq_length = window_size + sink_size
- assert self.max_seq_length == self.params.max_seq_len
+ self.max_context_length = window_size + sink_size
+ assert self.max_context_length == self.params.max_context_len
self.eviction_batch_size = eviction_batch_size
self.position_shift = 0
@@ -54,11 +51,14 @@ def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int):
input_pos_item = input_pos.item()
torch._check_is_size(input_pos_item)
- if input_pos_item + self.position_shift + seq_len > self.max_seq_length:
+ if input_pos_item + self.position_shift + seq_len > self.max_context_length:
# There are not enough spaces in the cache to store the new tokens.
# We need to evict some old tokens and shift some recent tokens.
num_to_evict = max(
- input_pos_item + self.position_shift - self.max_seq_length + seq_len,
+ input_pos_item
+ + self.position_shift
+ - self.max_context_length
+ + seq_len,
self.eviction_batch_size,
)
self.position_shift -= num_to_evict # pyre-ignore [8]
@@ -121,7 +121,7 @@ def __init__(
):
super().__init__(
max_batch_size=max_batch_size,
- max_seq_length=window_size + sink_size,
+ max_context_length=window_size + sink_size,
n_heads=n_heads,
head_dim=head_dim,
enable_dynamic_shape=enable_dynamic_shape,
@@ -148,11 +148,14 @@ def evict_tokens(self, input_pos: torch.Tensor, seq_len: int) -> int:
"""
input_pos_item = input_pos.item()
torch._check_is_size(input_pos_item)
- if input_pos_item + self.position_shift + seq_len > self.max_seq_length:
+ if input_pos_item + self.position_shift + seq_len > self.max_context_length:
# There are not enough spaces in the cache to store the new tokens.
# We need to evict some old tokens and shift some recent tokens.
num_to_evict = max(
- input_pos_item + self.position_shift - self.max_seq_length + seq_len,
+ input_pos_item
+ + self.position_shift
+ - self.max_context_length
+ + seq_len,
self.eviction_batch_size,
)
num_to_keep = (
@@ -260,7 +263,7 @@ def _replace_attention(
eviction_batch_size=eviction_batch_size,
)
- if isinstance(child_module, Attention):
+ if isinstance(child_module, AttentionMHA):
kv_cache = child_module.kv_cache
kv_cache_with_attention_sink = KVCacheWithAttentionSink(
n_heads=kv_cache.n_heads,
diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py
index 90ec9879e5..023fc6800f 100644
--- a/examples/models/llama/source_transformation/quantized_kv_cache.py
+++ b/examples/models/llama/source_transformation/quantized_kv_cache.py
@@ -10,7 +10,7 @@
import torch
import torch.nn as nn
-from executorch.examples.models.llama.llama_transformer import KVCache
+from executorch.examples.models.llama.attention import KVCache
from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa: F401
@@ -33,7 +33,7 @@ class QuantizedKVCache(nn.Module):
def __init__(
self,
max_batch_size,
- max_seq_length,
+ max_context_length,
n_heads,
head_dim,
cache_type: QuantizedCacheType = QuantizedCacheType.AffineSymmetric,
@@ -52,8 +52,8 @@ def __init__(
self.use_custom_update_cache_op = use_custom_update_cache_op
self.quantized_cache_dtype = torch.int8
self.cache_fp_type = torch.float32
- cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)
- scale_shape = (max_batch_size, max_seq_length, n_heads, 1)
+ cache_shape = (max_batch_size, max_context_length, n_heads, head_dim)
+ scale_shape = (max_batch_size, max_context_length, n_heads, 1)
self.register_buffer(
"k_cache", torch.zeros(cache_shape, dtype=self.quantized_cache_dtype)
)
@@ -161,13 +161,15 @@ def from_float(
cache_type: QuantizedCacheType,
use_custom_update_cache_op: bool = False,
):
- max_batch_size, n_heads, max_seq_length, head_dim = kv_cache.k_cache.shape
+ max_batch_size, n_heads, max_context_length, head_dim = kv_cache.k_cache.shape
if isinstance(kv_cache, CustomKVCache):
# If replacing custom kv cache, then the shape is [B, S, H, D]
- max_batch_size, max_seq_length, n_heads, head_dim = kv_cache.k_cache.shape
+ max_batch_size, max_context_length, n_heads, head_dim = (
+ kv_cache.k_cache.shape
+ )
return cls(
max_batch_size,
- max_seq_length,
+ max_context_length,
n_heads,
head_dim,
cache_type,
@@ -226,14 +228,14 @@ class CustomKVCache(nn.Module):
def __init__(
self,
max_batch_size: int,
- max_seq_length: int,
+ max_context_length: int,
n_heads: int,
head_dim: int,
dtype=torch.float32,
):
super().__init__()
- self.max_seq_length = max_seq_length
- cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)
+ self.max_context_length = max_context_length
+ cache_shape = (max_batch_size, max_context_length, n_heads, head_dim)
self.max_batch_size = max_batch_size
self.n_heads = n_heads
@@ -275,13 +277,13 @@ def replace_kv_cache_with_custom_kv_cache(module):
if isinstance(child, KVCache):
cache_shape = child.k_cache.shape
cache_dtype = child.k_cache.dtype
- max_batch_size, n_heads, max_seq_length, head_dim = cache_shape
+ max_batch_size, n_heads, max_context_length, head_dim = cache_shape
setattr(
module,
name,
CustomKVCache(
max_batch_size,
- max_seq_length,
+ max_context_length,
n_heads,
head_dim,
dtype=cache_dtype,
diff --git a/examples/models/llama/source_transformation/sdpa.py b/examples/models/llama/source_transformation/sdpa.py
index 6a54d6a119..1bb7d27754 100644
--- a/examples/models/llama/source_transformation/sdpa.py
+++ b/examples/models/llama/source_transformation/sdpa.py
@@ -13,7 +13,7 @@
import torch
-from executorch.examples.models.llama.llama_transformer import KVCache, SDPA
+from executorch.examples.models.llama.attention import KVCache, SDPA
class SDPACustom(torch.nn.Module):
@@ -268,14 +268,14 @@ class KVCacheCoreML(torch.nn.Module):
def __init__(
self,
max_batch_size: int,
- max_seq_length: int,
+ max_context_length: int,
n_heads: int,
head_dim: int,
dtype=torch.float32,
):
super().__init__()
- self.max_seq_length = max_seq_length
- cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)
+ self.max_context_length = max_context_length
+ cache_shape = (max_batch_size, n_heads, max_context_length, head_dim)
self.max_batch_size = max_batch_size
self.n_heads = n_heads
@@ -303,7 +303,7 @@ def replace_kv_cache_with_coreml_kv_cache(module: torch.nn.Module):
name,
KVCacheCoreML(
child.max_batch_size,
- child.max_seq_length,
+ child.max_context_length,
child.n_heads,
child.head_dim,
child.k_cache.dtype,
@@ -318,13 +318,13 @@ class KVCacheSimple(torch.nn.Module):
def __init__(
self,
max_batch_size: int,
- max_seq_length: int,
+ max_context_length: int,
n_heads: int,
head_dim: int,
dtype=torch.float32,
):
super().__init__()
- cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)
+ cache_shape = (max_batch_size, max_context_length, n_heads, head_dim)
self.register_buffer(
"past_k_caches",
torch.zeros(cache_shape, dtype=dtype, device="cpu"),
@@ -358,7 +358,7 @@ def replace_kv_cache_with_simple_kv_cache(module: torch.nn.Module):
name,
KVCacheSimple(
child.max_batch_size,
- child.max_seq_length,
+ child.max_context_length,
child.n_heads,
child.head_dim,
child.k_cache.dtype,
@@ -373,9 +373,9 @@ def replace_causal_mask(module: torch.nn.Module):
for buffer_fqn_name, buffer in module.named_buffers():
buffer_name = buffer_fqn_name.split(".")[-1]
if buffer_name == "mask":
- max_seq_len = buffer.shape[-1]
+ max_context_len = buffer.shape[-1]
mask = torch.full(
- (max_seq_len, max_seq_len),
+ (max_context_len, max_context_len),
float("-inf"),
device="cpu",
)
diff --git a/examples/models/llama/source_transformation/test_attention_sink.py b/examples/models/llama/source_transformation/test_attention_sink.py
index 4dd522dff2..fc882ebf4a 100644
--- a/examples/models/llama/source_transformation/test_attention_sink.py
+++ b/examples/models/llama/source_transformation/test_attention_sink.py
@@ -7,7 +7,7 @@
import unittest
import torch
-from executorch.examples.models.llama.llama_transformer import ModelArgs
+from executorch.examples.models.llama.model_args import ModelArgs
from executorch.examples.models.llama.source_transformation.attention_sink import (
KVCacheWithAttentionSink,
@@ -29,7 +29,7 @@ def _init_rope(self, params: ModelArgs, eviction_batch_size: int):
def setUp(self):
torch.manual_seed(42)
self.params = ModelArgs(
- use_kv_cache=True, enable_dynamic_shape=True, max_seq_len=256
+ use_kv_cache=True, enable_dynamic_shape=True, max_context_len=256
)
self.rope_with_attention_sink = self._init_rope(
params=self.params, eviction_batch_size=1
@@ -135,7 +135,7 @@ def _init_cache(self, sink_size, eviction_batch_size):
self.params = ModelArgs(
use_kv_cache=True,
enable_dynamic_shape=True,
- max_seq_len=self.window_size + sink_size,
+ max_context_len=self.window_size + sink_size,
)
self.rope_with_attention_sink = RopeWithAttentionSink(
params=self.params,
diff --git a/examples/models/llama/source_transformation/test_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_quantized_kv_cache.py
index 67ebbc7b3f..4252518a4e 100644
--- a/examples/models/llama/source_transformation/test_quantized_kv_cache.py
+++ b/examples/models/llama/source_transformation/test_quantized_kv_cache.py
@@ -8,7 +8,7 @@
import torch
-from executorch.examples.models.llama.llama_transformer import KVCache
+from executorch.examples.models.llama.attention import KVCache
from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
QuantizedCacheType,
@@ -20,7 +20,7 @@ class QuantizedKVCacheTest(unittest.TestCase):
def _init_cache(self):
self.kv_cache = KVCache(
self.max_batch_size,
- self.max_seq_len,
+ self.max_context_len,
self.n_kv_heads,
self.head_dim,
self.enable_dynamic_shape,
@@ -36,7 +36,7 @@ def _init_kv(self):
def setUp(self):
torch.manual_seed(42)
self.max_batch_size = 1
- self.max_seq_len = 5
+ self.max_context_len = 5
self.n_kv_heads = 8
self.head_dim = 17
self.enable_dynamic_shape = False
diff --git a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py
index 0081c5072c..35c88e10b6 100644
--- a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py
+++ b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py
@@ -8,7 +8,7 @@
import torch
-from executorch.examples.models.llama.llama_transformer import KVCache
+from executorch.examples.models.llama.attention import KVCache
from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
CustomKVCache,
@@ -23,7 +23,7 @@ class SDPAWithQuantizedKVCacheTest(unittest.TestCase):
def _init_cache(self):
self.kv_cache = KVCache(
self.max_batch_size,
- self.max_seq_len,
+ self.max_context_len,
self.n_kv_heads,
self.head_dim,
self.enable_dynamic_shape,
@@ -40,7 +40,7 @@ def _init_cache(self):
# as a sequence of token positions
self.custom_kv_cache = CustomKVCache(
self.max_batch_size,
- self.max_seq_len,
+ self.max_context_len,
self.n_kv_heads,
self.head_dim,
dtype=self.dtype,
@@ -57,7 +57,7 @@ def _init_kv(self):
def setUp(self):
torch.manual_seed(42)
self.max_batch_size = 1
- self.max_seq_len = 5
+ self.max_context_len = 5
self.n_kv_heads = 4
self.n_heads = 8
self.head_dim = 17
diff --git a/examples/models/llama/tests/test_pre_quantization_transforms.py b/examples/models/llama/tests/test_pre_quantization_transforms.py
index dc7c640dba..345f3fad9b 100644
--- a/examples/models/llama/tests/test_pre_quantization_transforms.py
+++ b/examples/models/llama/tests/test_pre_quantization_transforms.py
@@ -7,7 +7,8 @@
import unittest
import torch
-from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer
+from executorch.examples.models.llama.llama_transformer import Transformer
+from executorch.examples.models.llama.model_args import ModelArgs
from executorch.examples.models.llama.source_transformation.pre_quantization import (
sanitize_checkpoint_from_pre_quantization,
transform_embedding_for_pre_quantization,
diff --git a/examples/models/llama/tests/test_simple_sdpa.py b/examples/models/llama/tests/test_simple_sdpa.py
index 4088165c71..d60bc30b7d 100644
--- a/examples/models/llama/tests/test_simple_sdpa.py
+++ b/examples/models/llama/tests/test_simple_sdpa.py
@@ -7,7 +7,7 @@
import unittest
import torch
-from executorch.examples.models.llama.llama_transformer import KVCache, SDPA
+from executorch.examples.models.llama.attention import KVCache, SDPA
from executorch.examples.models.llama.source_transformation.sdpa import SDPASimple
@@ -15,7 +15,7 @@ class SDPATest(unittest.TestCase):
def test_simple_sdpa(self):
# Verify the correctness between the simple SDPA and the original SDPA module defined in llama_transformer.py
max_batch_size = 1
- max_seq_length = 128
+ max_context_length = 128
n_heads = 8
head_dim = 8
dim = 64
@@ -25,7 +25,7 @@ def test_simple_sdpa(self):
n_local_heads = n_heads
kv_cache = KVCache(
max_batch_size=max_batch_size,
- max_seq_length=max_seq_length,
+ max_context_length=max_context_length,
n_heads=n_heads,
head_dim=head_dim,
enable_dynamic_shape=False,
@@ -34,14 +34,14 @@ def test_simple_sdpa(self):
dim=dim,
head_dim=head_dim,
n_rep=n_rep,
- max_seq_len=max_seq_length,
+ max_context_len=max_context_length,
enable_dynamic_shape=False,
)
input_pos = torch.tensor([0])
query = torch.randn(1, 1, n_local_heads, head_dim)
key = torch.randn(1, 1, n_local_heads, head_dim)
value = torch.randn(1, 1, n_local_heads, head_dim)
- mask = torch.randn(max_seq_length, max_seq_length)
+ mask = torch.randn(max_context_length, max_context_length)
query = query.transpose(1, 2)
key = key.transpose(1, 2)
value = value.transpose(1, 2)
diff --git a/examples/models/llama3_2_vision/cross_attention/cross_attention_mask_test.cpp b/examples/models/llama3_2_vision/cross_attention/cross_attention_mask_test.cpp
index e2256b14a8..8d144b4f72 100644
--- a/examples/models/llama3_2_vision/cross_attention/cross_attention_mask_test.cpp
+++ b/examples/models/llama3_2_vision/cross_attention/cross_attention_mask_test.cpp
@@ -11,9 +11,9 @@
#include
using namespace ::testing;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
-using exec_aten::TensorImpl;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) {
std::vector tokens = {
diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py
index 68a9e59e0c..304b49759f 100644
--- a/examples/models/llava/model.py
+++ b/examples/models/llava/model.py
@@ -12,7 +12,8 @@
import requests
import torch
-from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer
+from executorch.examples.models.llama.llama_transformer import Transformer
+from executorch.examples.models.llama.model_args import ModelArgs
from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
replace_kv_cache_with_custom_kv_cache,
diff --git a/examples/models/llava/runner/llava_image_prefiller.h b/examples/models/llava/runner/llava_image_prefiller.h
index b4b1ef420c..c48fe2b1fe 100644
--- a/examples/models/llava/runner/llava_image_prefiller.h
+++ b/examples/models/llava/runner/llava_image_prefiller.h
@@ -26,7 +26,7 @@ class ET_EXPERIMENTAL LlavaImagePrefiller
* @param start_pos The starting position in KV cache of the input in the LLM
* @return logits of the image prefill.
*/
- inline ::executorch::runtime::Result prefill(
+ inline ::executorch::runtime::Result prefill(
::executorch::extension::llm::Image& image,
int64_t& start_pos) override {
auto image_tensor = executorch::extension::from_blob(
diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h
index 236d412910..4c7809361b 100644
--- a/examples/models/llava/runner/llava_text_decoder_runner.h
+++ b/examples/models/llava/runner/llava_text_decoder_runner.h
@@ -23,7 +23,7 @@ class ET_EXPERIMENTAL LlavaTextDecoderRunner
float temperature)
: TextDecoderRunner(module, true, vocab_size, temperature){};
- inline executorch::runtime::Result step(
+ inline executorch::runtime::Result step(
executorch::extension::TensorPtr& tokens,
executorch::extension::TensorPtr& start_pos) override {
// run token embedding
diff --git a/examples/models/phi-3-mini/runner.cpp b/examples/models/phi-3-mini/runner.cpp
index ca299d3b11..1163a35d66 100644
--- a/examples/models/phi-3-mini/runner.cpp
+++ b/examples/models/phi-3-mini/runner.cpp
@@ -73,14 +73,15 @@ void Runner::generate(const std::string& prompt, std::size_t max_seq_len) {
std::cout << std::endl;
}
-uint64_t Runner::logits_to_token(const exec_aten::Tensor& logits_tensor) {
+uint64_t Runner::logits_to_token(
+ const executorch::aten::Tensor& logits_tensor) {
return sampler_->sample(logits_tensor.data_ptr());
}
uint64_t Runner::prefill(std::vector& tokens) {
auto result = module_->forward(executorch::extension::from_blob(
tokens.data(),
- {1, static_cast(tokens.size())},
+ {1, static_cast(tokens.size())},
ScalarType::Long));
ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens");
diff --git a/examples/models/phi-3-mini/runner.h b/examples/models/phi-3-mini/runner.h
index 9b24f97170..2048acdab2 100644
--- a/examples/models/phi-3-mini/runner.h
+++ b/examples/models/phi-3-mini/runner.h
@@ -38,7 +38,7 @@ class Runner {
void generate(const std::string& prompt, std::size_t max_seq_len);
private:
- uint64_t logits_to_token(const exec_aten::Tensor& logits_tensor);
+ uint64_t logits_to_token(const executorch::aten::Tensor& logits_tensor);
uint64_t prefill(std::vector& tokens);
uint64_t run_model_step(uint64_t token);
diff --git a/examples/portable/custom_ops/custom_ops_1_out.cpp b/examples/portable/custom_ops/custom_ops_1_out.cpp
index 660107f275..e26dfefe23 100644
--- a/examples/portable/custom_ops/custom_ops_1_out.cpp
+++ b/examples/portable/custom_ops/custom_ops_1_out.cpp
@@ -11,8 +11,8 @@
namespace custom {
namespace native {
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using executorch::runtime::KernelRuntimeContext;
namespace {
diff --git a/examples/portable/custom_ops/custom_ops_2_out.cpp b/examples/portable/custom_ops/custom_ops_2_out.cpp
index 69436750cc..138a8eeed8 100644
--- a/examples/portable/custom_ops/custom_ops_2_out.cpp
+++ b/examples/portable/custom_ops/custom_ops_2_out.cpp
@@ -11,8 +11,8 @@
namespace custom {
namespace native {
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
using executorch::runtime::KernelRuntimeContext;
namespace {
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index 65ba762743..f7702fae3d 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
+ * Copyright 2024-2025 Arm Limited and/or its affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
@@ -25,10 +26,14 @@
#include
#include
#include
+#include
#include
#include
#include
#include
+#ifdef ET_EVENT_TRACER_ENABLED
+#include
+#endif // ET_EVENT_TRACER_ENABLED
static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB
@@ -38,10 +43,15 @@ DEFINE_string(
model_path,
"model.pte",
"Model serialized in flatbuffer format.");
+DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
+#ifdef ET_EVENT_TRACER_ENABLED
+DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
+#endif // ET_EVENT_TRACER_ENABLED
using executorch::extension::FileDataLoader;
using executorch::runtime::Error;
using executorch::runtime::EValue;
+using executorch::runtime::EventTracer;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
@@ -51,6 +61,56 @@ using executorch::runtime::Program;
using executorch::runtime::Result;
using executorch::runtime::Span;
+/// Helper to manage resources for ETDump generation
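+/// When ET_EVENT_TRACER_ENABLED is defined, an ETDumpGen tracer is created and
+/// its collected data can be written to the path given by --etdump_path;
+/// otherwise get_event_tracer() returns nullptr and no ETDump is produced.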
+class EventTraceManager {
+ public:
+ EventTraceManager() : event_tracer_ptr_(nullptr) {
+#ifdef ET_EVENT_TRACER_ENABLED
+ event_tracer_ptr_ = std::make_shared();
+#endif // ET_EVENT_TRACER_ENABLED
+ }
+
+ EventTracer* get_event_tracer() const {
+ return event_tracer_ptr_.get();
+ };
+
+ Error write_etdump_to_file() const {
+ EventTracer* const event_tracer_ptr = get_event_tracer();
+ if (!event_tracer_ptr) {
+ return Error::NotSupported;
+ }
+
+#ifdef ET_EVENT_TRACER_ENABLED
+ executorch::etdump::ETDumpGen* const etdump_ptr =
+ static_cast(event_tracer_ptr);
+
+ const char* filename = FLAGS_etdump_path.c_str();
+
+ std::unique_ptr etdump_file(
+ fopen(filename, "w+"), fclose);
+ if (!etdump_file) {
+ ET_LOG(Error, "Failed to open ETDump file at %s.", filename);
+ return Error::AccessFailed;
+ }
+
+ executorch::etdump::ETDumpResult result = etdump_ptr->get_etdump_data();
+ if (result.buf != nullptr && result.size > 0) {
+ fwrite((uint8_t*)result.buf, 1, result.size, etdump_file.get());
+ free(result.buf);
+ ET_LOG(Info, "ETDump written to file '%s'.", filename);
+ } else {
+ ET_LOG(Error, "No ETDump data available!");
+ return Error::NotFound;
+ }
+#endif // ET_EVENT_TRACER_ENABLED
+
+ return Error::Ok;
+ }
+
+ private:
+ std::shared_ptr event_tracer_ptr_;
+};
+
int main(int argc, char** argv) {
executorch::runtime::runtime_init();
@@ -158,8 +218,9 @@ int main(int argc, char** argv) {
// the method can mutate the memory-planned buffers, so the method should only
// be used by a single thread at a time, but it can be reused.
//
-
- Result method = program->load_method(method_name, &memory_manager);
+ EventTraceManager tracer;
+ Result method = program->load_method(
+ method_name, &memory_manager, tracer.get_event_tracer());
ET_CHECK_MSG(
method.ok(),
"Loading of method %s failed with status 0x%" PRIx32,
@@ -178,18 +239,23 @@ int main(int argc, char** argv) {
ET_LOG(Info, "Inputs prepared.");
// Run the model.
- Error status = method->execute();
- ET_CHECK_MSG(
- status == Error::Ok,
- "Execution of method %s failed with status 0x%" PRIx32,
- method_name,
- (uint32_t)status);
- ET_LOG(Info, "Model executed successfully.");
+ for (uint32_t i = 0; i < FLAGS_num_executions; i++) {
+ Error status = method->execute();
+ ET_CHECK_MSG(
+ status == Error::Ok,
+ "Execution of method %s failed with status 0x%" PRIx32,
+ method_name,
+ (uint32_t)status);
+ }
+ ET_LOG(
+ Info,
+ "Model executed successfully %" PRIu32 " time(s).",
+ FLAGS_num_executions);
// Print the outputs.
std::vector outputs(method->outputs_size());
ET_LOG(Info, "%zu outputs: ", outputs.size());
- status = method->get_outputs(outputs.data(), outputs.size());
+ Error status = method->get_outputs(outputs.data(), outputs.size());
ET_CHECK(status == Error::Ok);
// Print the first and last 100 elements of long lists of scalars.
std::cout << executorch::extension::evalue_edge_items(100);
@@ -197,5 +263,12 @@ int main(int argc, char** argv) {
std::cout << "Output " << i << ": " << outputs[i] << std::endl;
}
+ if (tracer.get_event_tracer()) {
+ // Dump ETDump data containing profiling/debugging data to the file
+ // specified by the --etdump_path command line flag.
+ Error status = tracer.write_etdump_to_file();
+ ET_CHECK_MSG(status == Error::Ok, "Failed to save ETDump file.");
+ }
+
return 0;
}
diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt
index a8e16bb5c9..55969f937e 100644
--- a/examples/qualcomm/CMakeLists.txt
+++ b/examples/qualcomm/CMakeLists.txt
@@ -84,11 +84,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
# build qnn_executor_runner
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/executor_runner)
-# build qnn_llama_runner for llama2
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2)
-
-# build qnn_llama_runner for llama3.2
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama3_2)
+# build qnn_llama_runner for llama
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama)
# build qaihub_llama2_7b_runner and qaihub_llama3_8b_runner
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama)
diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md
index 3d5eb42939..bdac58d2bf 100644
--- a/examples/qualcomm/README.md
+++ b/examples/qualcomm/README.md
@@ -4,10 +4,10 @@ This directory contains examples for some AI models.
We have separated the example scripts into the following subfolders; please refer to [README.md](../../backends/qualcomm/README.md) for the example scripts' directory structure:
-1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script.
+1. executor_runner: This folder contains a general executor runner capable of running most of the models. As a rule of thumb, if a model does not have its own customized runner, execute the model using [executor_runner](./executor_runner/qnn_executor_runner.cpp). On the other hand, if a model has its own runner, such as [llama](./oss_scripts/llama/qnn_llama_runner.cpp), use the customized runner to execute the model. Customized runner should be located under the same folder as the model's python script.
2. oss_scripts: OSS stands for Open Source Software. This folder contains python scripts for open source models. Some models under this folder might also have their own customized runner.
- For example, [llama2](./oss_scripts/llama2/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model.
+ For example, [llama](./oss_scripts/llama/qnn_llama_runner.cpp) contains not only the python scripts to prepare the model but also a customized runner for executing the model.
3. qaihub_scripts: QAIHub stands for [Qualcomm AI Hub](https://aihub.qualcomm.com/). On QAIHub, users can find pre-compiled context binaries, a format used by QNN to save its models. This provides users with a new option for model deployment. Unlike oss_scripts & scripts, whose example scripts convert a model from nn.Module to ExecuTorch .pte files, qaihub_scripts provides example scripts for converting pre-compiled context binaries to ExecuTorch .pte files. Additionally, users can find customized example runners specific to the QAIHub models for execution. For example, [qaihub_llama2_7b](./qaihub_scripts/llama2/qaihub_llama2_7b.py) is a script that converts context binaries to ExecuTorch .pte files, and [qaihub_llama2_7b_runner](./qaihub_scripts/llama2/qaihub_llama2_7b_runner.cpp) is a customized example runner for executing llama2 .pte files. Please be aware that context binaries downloaded from QAIHub are tied to a specific QNN SDK version.
Before executing the scripts and runner, please ensure that you are using the QNN SDK version that matches the context binary. Please refer to [Check context binary version](#check-context-binary-version) for a tutorial on how to check the QNN version of a context binary.
diff --git a/examples/qualcomm/oss_scripts/conv_former.py b/examples/qualcomm/oss_scripts/conv_former.py
new file mode 100644
index 0000000000..76131d659d
--- /dev/null
+++ b/examples/qualcomm/oss_scripts/conv_former.py
@@ -0,0 +1,139 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import os
+import sys
+from multiprocessing.connection import Client
+
+import numpy as np
+import timm
+import torch
+from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+from executorch.backends.qualcomm.utils.constants import (
+ QCOM_PASS_EXPAND_BROADCAST_SHAPE,
+)
+from executorch.examples.qualcomm.utils import (
+ build_executorch_binary,
+ get_imagenet_dataset,
+ make_output_dir,
+ parse_skip_delegation_node,
+ setup_common_args_and_variables,
+ SimpleADB,
+ topk_accuracy,
+)
+
+
+def main(args):
+ skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args)
+
+ # Ensure the working directory exists.
+ os.makedirs(args.artifact, exist_ok=True)
+
+ if not args.compile_only and args.device is None:
+ raise RuntimeError(
+ "device serial is required if not compile only. "
+ "Please specify a device serial by -s/--device argument."
+ )
+
+ data_num = 100
+ if args.compile_only:
+ inputs = [(torch.rand(1, 3, 224, 224),)]
+ else:
+ inputs, targets, input_list = get_imagenet_dataset(
+ dataset_path=f"{args.dataset}",
+ data_size=data_num,
+ image_shape=(256, 256),
+ crop_size=224,
+ )
+
+ pte_filename = "conv_former"
+ model = timm.create_model("convformer_s18.sail_in1k", pretrained=True)
+
+ model = model.eval()
+
+ build_executorch_binary(
+ model,
+ inputs[0],
+ args.model,
+ f"{args.artifact}/{pte_filename}",
+ inputs,
+ quant_dtype=QuantDtype.use_8a8w,
+ custom_pass_config={QCOM_PASS_EXPAND_BROADCAST_SHAPE},
+ )
+
+ if args.compile_only:
+ sys.exit(0)
+
+ adb = SimpleADB(
+ qnn_sdk=os.getenv("QNN_SDK_ROOT"),
+ build_path=f"{args.build_folder}",
+ pte_path=f"{args.artifact}/{pte_filename}.pte",
+ workspace=f"/data/local/tmp/executorch/{pte_filename}",
+ device_id=args.device,
+ host_id=args.host,
+ soc_model=args.model,
+ shared_buffer=args.shared_buffer,
+ )
+ adb.push(inputs=inputs, input_list=input_list)
+ adb.execute()
+
+ # collect output data
+ output_data_folder = f"{args.artifact}/outputs"
+ make_output_dir(output_data_folder)
+
+ adb.pull(output_path=args.artifact)
+
+ # top-k analysis
+ predictions = []
+ for i in range(data_num):
+ predictions.append(
+ np.fromfile(
+ os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32
+ )
+ )
+
+ k_val = [1, 5]
+ topk = [topk_accuracy(predictions, targets, k).item() for k in k_val]
+ if args.ip and args.port != -1:
+ with Client((args.ip, args.port)) as conn:
+ conn.send(json.dumps({f"top_{k}": topk[i] for i, k in enumerate(k_val)}))
+ else:
+ for i, k in enumerate(k_val):
+ print(f"top_{k}->{topk[i]}%")
+
+
+if __name__ == "__main__":
+ parser = setup_common_args_and_variables()
+ parser.add_argument(
+ "-a",
+ "--artifact",
+ help="path for storing generated artifacts by this example. Default ./conv_former",
+ default="./conv_former",
+ type=str,
+ )
+
+ parser.add_argument(
+ "-d",
+ "--dataset",
+ help=(
+ "path to the validation folder of ImageNet dataset. "
+ "e.g. --dataset imagenet-mini/val "
+ "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+ ),
+ type=str,
+ required=True,
+ )
+
+ args = parser.parse_args()
+ try:
+ main(args)
+ except Exception as e:
+ if args.ip and args.port != -1:
+ with Client((args.ip, args.port)) as conn:
+ conn.send(json.dumps({"Error": str(e)}))
+ else:
+ raise Exception(e)
diff --git a/examples/qualcomm/oss_scripts/fastvit.py b/examples/qualcomm/oss_scripts/fastvit.py
index 0e2c695ab3..f0d2f4c3f0 100644
--- a/examples/qualcomm/oss_scripts/fastvit.py
+++ b/examples/qualcomm/oss_scripts/fastvit.py
@@ -10,6 +10,9 @@
import numpy as np
import torch
+from executorch.backends.qualcomm._passes.expand_broadcast_tensor_shape import (
+ ExpandBroadcastTensorShape,
+)
from executorch.backends.qualcomm.quantizer.annotators import (
QuantizationConfig,
QuantizationSpec,
@@ -23,10 +26,11 @@
)
from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
-from executorch.backends.qualcomm.utils.constants import (
- QCOM_PASS_EXPAND_BROADCAST_SHAPE,
+from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY
+from executorch.backends.qualcomm.utils.utils import (
+ convert_linear_to_conv2d,
+ get_capture_program_passes,
)
-from executorch.backends.qualcomm.utils.utils import convert_linear_to_conv2d
from executorch.examples.qualcomm.utils import (
build_executorch_binary,
get_imagenet_dataset,
@@ -111,6 +115,8 @@ def main(args):
bias=q_config.bias,
)
# lower to QNN
+ passes_job = get_capture_program_passes()
+ passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True
build_executorch_binary(
convert_linear_to_conv2d(get_instance(args.oss_repo, args.pretrained_weight)),
inputs[0],
@@ -121,7 +127,7 @@ def main(args):
skip_node_op_set=skip_node_op_set,
quant_dtype=QuantDtype.use_8a8w,
custom_quantizer=quantizer,
- custom_pass_config={QCOM_PASS_EXPAND_BROADCAST_SHAPE},
+ passes_job=passes_job,
shared_buffer=args.shared_buffer,
)
diff --git a/examples/qualcomm/oss_scripts/llama3_2/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt
similarity index 60%
rename from examples/qualcomm/oss_scripts/llama3_2/CMakeLists.txt
rename to examples/qualcomm/oss_scripts/llama/CMakeLists.txt
index 93b35a697c..4059ae7151 100644
--- a/examples/qualcomm/oss_scripts/llama3_2/CMakeLists.txt
+++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt
@@ -18,38 +18,37 @@ target_link_libraries(
)
target_link_options_shared_lib(custom_ops)
-# preprocess qnn runner src files for llama3.2
-set(_llama3_2_runner__srcs ${_llama_runner__srcs})
-list(TRANSFORM _llama3_2_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/")
-list(FILTER _llama3_2_runner__srcs EXCLUDE REGEX ".*(/runner/).*")
+# preprocess qnn runner src files for llama
+set(_llama_runner__srcs ${_llama_runner__srcs})
+list(TRANSFORM _llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/")
+list(FILTER _llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*")
list(
PREPEND
- _llama3_2_runner__srcs
- ${CMAKE_CURRENT_LIST_DIR}/qnn_llama3_2_runner.cpp
+ _llama_runner__srcs
+ ${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/runner.h
- ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.cpp
- ${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.h
+ ${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.cpp
+ ${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.h
)
-list(
- APPEND _llama3_2_runner__srcs
- ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
-)
list(
APPEND
- _llama3_2_runner__srcs
+ _llama_runner__srcs
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama/tokenizer/llama_tiktoken.cpp
)
-# build qnn llama3.2 1b runner
-add_executable(qnn_llama3_2_runner ${_llama3_2_runner__srcs})
+# build qnn llama runner
+add_executable(qnn_llama_runner ${_llama_runner__srcs})
target_include_directories(
- qnn_llama3_2_runner PUBLIC ${_common_include_directories}
+ qnn_llama_runner PUBLIC ${_common_include_directories}
)
+target_link_options_shared_lib(quantized_ops_lib)
+
target_link_libraries(
- qnn_llama3_2_runner
+ qnn_llama_runner
qnn_executorch_backend
executorch_core
extension_data_loader
@@ -58,10 +57,12 @@ target_link_libraries(
gflags
re2::re2
custom_ops
+ quantized_ops_lib
+ quantized_kernels
)
target_compile_options(
- qnn_llama3_2_runner PUBLIC ${_common_compile_options}
+ qnn_llama_runner PUBLIC ${_common_compile_options}
)
set_target_properties(
- qnn_llama3_2_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
+ qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
)
diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md
new file mode 100644
index 0000000000..79c20180d6
--- /dev/null
+++ b/examples/qualcomm/oss_scripts/llama/README.md
@@ -0,0 +1,70 @@
+# Summary
+
+## Overview
+This file provides instructions for running LLAMA models with different parameters via the Qualcomm HTP backend. We currently support the following models:
+ 1. LLAMA2 Stories 110M
+ 2. LLAMA3.2 1B
+ 3. LLAMA3.2 3B (WIP)
+We offer the following modes to execute the model:
+
+- Prefill Mode: This is also known as batch prefill mode, where the model takes in a list of tokens as input and generates the next token along with the key-value (KV) cache for all tokens. This mode is efficient for generating the initial sequence of tokens (usually the user's prompt).
+
+- KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt.
+
+- Hybrid Mode: Hybrid mode leverages the strengths of both batch prefill and KV cache modes to optimize token generation speed. Initially, it uses prefill mode to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens. A minimal sketch of this flow is shown below.
+
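+The sketch below outlines, at a high level, how hybrid mode stitches the two
+graphs together. It is illustrative only: `prefill_step`, `decode_step`, and the
+values used are hypothetical stand-ins, not the runner's actual API.
+
+```python
+# Illustrative sketch of the hybrid-mode flow; names are hypothetical stand-ins.
+import torch
+
+def prefill_step(tokens):          # stand-in for the exported prefill graph
+    return torch.rand(len(tokens), 32000), "kv_cache"
+
+def decode_step(token, kv_cache):  # stand-in for the exported KV-cache graph
+    return torch.rand(1, 32000), kv_cache
+
+prompt_tokens = [1, 15043, 29892]  # pretend-encoded prompt
+eos_id, kv_seq_len = 2, 128
+
+# 1) Prefill: run the whole prompt once to build the KV cache and the first token.
+logits, kv_cache = prefill_step(prompt_tokens)
+next_token = int(torch.argmax(logits[-1]))
+
+# 2) Decode: generate one token at a time, reusing and updating the KV cache.
+generated = [next_token]
+while next_token != eos_id and len(prompt_tokens) + len(generated) < kv_seq_len:
+    logits, kv_cache = decode_step(next_token, kv_cache)
+    next_token = int(torch.argmax(logits[-1]))
+    generated.append(next_token)
+```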
+
+## Instructions
+### Note
+1. For hybrid mode, export takes longer and can take 1 to 4 hours to complete, depending on the specific model being exported.
+2. When exporting a hybrid mode model, memory consumption will be higher. Taking LLAMA3.2 1B as an example, please ensure the device has at least 80 GB of memory and swap space.
+
+
+### Step 1: Setup
+1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch.
+2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend.
+
+### Step 2: Prepare Model
+
+#### LLAMA2
+Download and prepare stories110M model
+
+```bash
+# tokenizer.model & stories110M.pt:
+wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
+wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
+
+# tokenizer.bin:
+python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+
+# params.json:
+echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
+```
+
+#### LLAMA3.2
+Follow the [instructions](https://www.llama.com/) to download models.
+At the end of this step, users should have the following files ready: `consolidated.00.pth`, `params.json`, and `tokenizer.model`.
+
+
+### Step 3: Run default examples using hybrid mode.
+#### LLAMA2
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M.pt --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --llama_model stories110m --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "Once upon a time"
+```
+
+#### LLAMA3.2
+Default example using hybrid mode.
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1"
+```
+
+### Additional Configs when running the script
+If you would like to compile the model only, we have provided the flag `--compile_only`. Taking LLAMA3.2 as an example:
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --compile_only
+```
+
+On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model. Taking LLAMA3.2 as an example:
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE}
+```
\ No newline at end of file
diff --git a/examples/qualcomm/oss_scripts/llama2/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS
similarity index 57%
rename from examples/qualcomm/oss_scripts/llama2/TARGETS
rename to examples/qualcomm/oss_scripts/llama/TARGETS
index b0f5ea7f64..419316acf0 100644
--- a/examples/qualcomm/oss_scripts/llama2/TARGETS
+++ b/examples/qualcomm/oss_scripts/llama/TARGETS
@@ -5,7 +5,6 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
oncall("executorch")
-
python_library(
name = "static_llama",
srcs = [
@@ -16,12 +15,33 @@ python_library(
],
)
+python_library(
+ name = "llama_lib",
+ srcs = ["llama.py"],
+ deps = [
+ "//caffe2:torch",
+ "//executorch/backends/qualcomm/partition:partition",
+ "//executorch/backends/qualcomm/quantizer:quantizer",
+ "//executorch/devtools:lib",
+ "//executorch/examples/models:models",
+ "//executorch/examples/qualcomm/oss_scripts/llama:static_llama",
+ "//executorch/examples/qualcomm:utils",
+ "//executorch/extension/export_util:export_util",
+ "//executorch/extension/llm/custom_ops:model_sharding_py",
+ "//executorch/extension/llm/export:export_lib",
+ "//executorch/extension/pybindings:aten_lib",
+ ],
+)
+
python_binary(
name = "llama",
srcs = ["llama.py"],
- main_function = "executorch.examples.qualcomm.oss_scripts.llama2.llama.main",
+ main_function = "executorch.examples.qualcomm.oss_scripts.llama.llama.main",
+ preload_deps = [
+ "//executorch/extension/llm/custom_ops:model_sharding_py",
+ ],
deps = [
- ":static_llama",
+ "//executorch/examples/qualcomm/oss_scripts/llama:static_llama",
"//caffe2:torch",
"//executorch/extension/pybindings:aten_lib",
"//executorch/backends/qualcomm/partition:partition",
@@ -38,6 +58,8 @@ runtime.command_alias(
name = "llama_qnn",
env = {
"LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()),
+ # Placeholder to pass the QNN_SDK_ROOT check in executorch/examples/qualcomm/utils.py
+ "QNN_SDK_ROOT": "",
},
exe = ":llama",
)
diff --git a/examples/qualcomm/oss_scripts/llama3_2/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
similarity index 65%
rename from examples/qualcomm/oss_scripts/llama3_2/llama.py
rename to examples/qualcomm/oss_scripts/llama/llama.py
index a18690e941..e575a3f5c4 100755
--- a/examples/qualcomm/oss_scripts/llama3_2/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -14,20 +14,32 @@
import os
import sys
import time
+from collections import OrderedDict
from functools import partial
from multiprocessing.connection import Client
import torch
+from executorch.backends.qualcomm._passes.i64_to_i32 import I64toI32
from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
from executorch.backends.qualcomm.quantizer.custom_annotation import (
+ annotate_linear_16a8w_in_affine_layer,
annotate_matmul_16a8w,
+ annotate_prefill_kv_output,
)
from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
-from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO
+
+from executorch.backends.qualcomm.serialization.qc_schema_serialize import (
+ flatbuffer_to_option,
+ option_to_flatbuffer,
+)
+from executorch.backends.qualcomm.utils.constants import (
+ QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY,
+ QCOM_QUANTIZED_IO,
+)
from executorch.backends.qualcomm.utils.utils import (
capture_program,
convert_linear_to_conv2d,
@@ -35,10 +47,15 @@
generate_htp_compiler_spec,
generate_multi_graph_program,
generate_qnn_executorch_compiler_spec,
+ get_capture_program_passes,
get_soc_to_chipset_map,
update_spill_fill_size,
)
-from executorch.examples.qualcomm.oss_scripts.llama2.model.static_llama import (
+from executorch.examples.models.llama.source_transformation.quantize import (
+ get_quant_embedding_transform,
+)
+from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken
+from executorch.examples.qualcomm.oss_scripts.llama.model.static_llama import (
LlamaModel,
ModelArgs,
)
@@ -55,6 +72,9 @@
from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
from executorch.extension.llm.custom_ops import model_sharding
from executorch.extension.llm.export.builder import DType
+from executorch.extension.llm.tokenizer.tokenizer import (
+ Tokenizer as SentencePieceTokenizer,
+)
from executorch.extension.llm.tokenizer.utils import get_tokenizer
from torch.ao.quantization.observer import MinMaxObserver
@@ -66,74 +86,116 @@
logging.getLogger().setLevel(logging.INFO)
+def smart_mask_updator(atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches):
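+ # Smart-mask update: write the new K/V entries in place at position `pos` and
+ # unmask that position in the attention mask; existing cache entries are untouched.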
+ for i, k_cache in enumerate(k_caches):
+ k_cache[:, :, pos] = new_k_caches[i][:, :, 0]
+
+ for i, v_cache in enumerate(v_caches):
+ v_cache[:, pos, :] = new_v_caches[i]
+
+ atten_mask[0][pos] = 0
+ pos += 1
+ return (atten_mask, pos, k_caches, v_caches)
+
+
+def shift_pointer_updator(
+ atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
+):
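+ # Shift-pointer update: drop the oldest entry from each cache, append the new
+ # K/V at the end, and unmask one more position from the right of the mask.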
+ k_caches = [
+ torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1)
+ for i, k_cache in enumerate(k_caches)
+ ]
+ v_caches = [
+ torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1)
+ for i, v_cache in enumerate(v_caches)
+ ]
+
+ pos += 1
+ atten_mask[0][-pos - 1] = 0
+ return (atten_mask, pos, k_caches, v_caches)
+
+
def _kv_calibrate(
example_inputs,
user_prompts,
module: torch.fx.GraphModule,
- tokenizer_model_path="tokenizer.model",
+ tokenizer,
max_seq_len=512,
+ updator=smart_mask_updator,
+ use_i64_token=False,
):
- sp_model = get_tokenizer(tokenizer_model_path)
_, atten_mask, _, k_caches, v_caches = example_inputs
# TODO: change criteria & support batch inputs if necessary
pos = torch.tensor(0, dtype=torch.int32)
max_cache_len = max_seq_len - 1
- token_list = sp_model.encode(
- user_prompts, bos=True, eos=False, allowed_special="all"
- )
+
+ token_list = []
+ # Llama2 tokenizer has no special tokens
+ if isinstance(tokenizer, SentencePieceTokenizer):
+ token_list = tokenizer.encode(user_prompts, bos=True, eos=False)
+ elif isinstance(tokenizer, Tiktoken):
+ token_list = tokenizer.encode(
+ user_prompts, bos=True, eos=False, allowed_special="all"
+ )
+ else:
+ raise RuntimeError("Unkown tokenizer")
with torch.no_grad():
- while token_list[-1] != sp_model.eos_id and pos < max_cache_len:
+ while token_list[-1] != tokenizer.eos_id and pos < max_cache_len:
+ dtype = torch.int64 if use_i64_token else torch.int32
+ token = torch.full((1, 1), token_list[pos], dtype=dtype)
logits, new_k_caches, new_v_caches = module(
- torch.full((1, 1), token_list[pos], dtype=torch.int32),
+ token,
atten_mask,
torch.full((1, 1), pos),
*k_caches,
*v_caches,
)
- k_caches = [
- torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1)
- for i, k_cache in enumerate(k_caches)
- ]
- v_caches = [
- torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1)
- for i, v_cache in enumerate(v_caches)
- ]
-
- pos += 1
- atten_mask[0][-pos - 1] = 0
+ atten_mask, pos, k_caches, v_caches = updator(
+ atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
+ )
if pos >= len(token_list):
token_list.append(torch.argmax(logits[:, -1], dim=-1).item())
- print(f"calibration data:\n{sp_model.decode(token_list)}")
+ print(f"kv calibration data:\n{tokenizer.decode(token_list)}")
def _prefill_calibrate(
example_inputs,
user_prompts,
module: torch.fx.GraphModule,
- tokenizer_model_path="tokenizer.model",
+ tokenizer,
max_seq_len=512,
+ use_i64_token=False,
):
- sp_model = get_tokenizer(tokenizer_model_path)
_, atten_mask = example_inputs
max_cache_len = max_seq_len - 1
# TODO: change criteria & support batch inputs if necessary
- token_list = sp_model.encode(
- user_prompts, bos=True, eos=False, allowed_special="all"
- )
+
+ token_list = []
+ # Llama2 tokenizer has no special tokens
+ if isinstance(tokenizer, SentencePieceTokenizer):
+ token_list = tokenizer.encode(user_prompts, bos=True, eos=False)
+ elif isinstance(tokenizer, Tiktoken):
+ token_list = tokenizer.encode(
+ user_prompts, bos=True, eos=False, allowed_special="all"
+ )
+ else:
+ raise RuntimeError("Unkown tokenizer")
+
pos = len(token_list)
+ dtype = torch.int64 if use_i64_token else torch.int32
with torch.no_grad():
- while token_list[-1] != sp_model.eos_id and pos < max_cache_len:
- tmp_token_list = torch.tensor(token_list).reshape(1, -1)
+ while token_list[-1] != tokenizer.eos_id and pos < max_cache_len:
+ tmp_token_list = torch.tensor(token_list, dtype=dtype).reshape(1, -1)
if pos < max_cache_len:
tmp_token_list = torch.cat(
[
tmp_token_list,
- torch.zeros((1, max_cache_len - pos), dtype=torch.int32),
+ torch.zeros((1, max_cache_len - pos), dtype=dtype),
],
dim=1,
)
@@ -144,31 +206,36 @@ def _prefill_calibrate(
token_list.append(torch.argmax(logits[:, pos - 1], dim=-1).item())
pos += 1
- print(f"calibration data:\n{sp_model.decode(token_list)}")
+ print(f"prefill calibration data:\n{tokenizer.decode(token_list)}")
def calibrate(
example_inputs,
user_prompts,
module: torch.fx.GraphModule,
- tokenizer_model_path="tokenizer.model",
+ tokenizer,
max_seq_len=512,
+ kv_updator=smart_mask_updator,
+ use_i64_token=False,
):
if len(example_inputs) == 2:
_prefill_calibrate(
example_inputs,
user_prompts,
module,
- tokenizer_model_path,
+ tokenizer,
max_seq_len,
+ use_i64_token,
)
elif len(example_inputs) == 5:
_kv_calibrate(
example_inputs,
user_prompts,
module,
- tokenizer_model_path,
+ tokenizer,
max_seq_len,
+ updator=kv_updator,
+ use_i64_token=use_i64_token,
)
else:
raise RuntimeError("Get wrong inputs")
@@ -190,6 +257,7 @@ def __init__(self, llama_model, pte_filename) -> None:
else:
tokens, atten_mask = self.get_example_inputs(use_kv_cache=False)
self.inputs = (tokens, atten_mask)
+ self.llama_graph_module = llama_model
def _tag_ios(self, gm: torch.fx.GraphModule, fixed_point_type):
if not self.has_quant_io:
@@ -280,7 +348,7 @@ def _tag_ios(self, gm: torch.fx.GraphModule, fixed_point_type):
return quant_attrs
- def quantize(self, quant_dtype, args, custom_annotations=()):
+ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()):
self.quant_dtype = quant_dtype
quantizer = make_quantizer(
quant_dtype=quant_dtype,
@@ -295,19 +363,22 @@ def quantize(self, quant_dtype, args, custom_annotations=()):
with torch.no_grad():
fx_graph_module = torch.export.export(
- self.llama_model, self.inputs, strict=True
+ self.llama_graph_module, self.inputs, strict=True
).module()
fx_graph_module = prepare_pt2e(fx_graph_module, quantizer)
+
logging.info("Quantizing the model...")
calibrate(
self.get_example_inputs(self.llama_meta["get_use_kv_cache"]),
args.prompt,
fx_graph_module,
- tokenizer_model_path=args.tokenizer_model,
+ tokenizer=tokenizer,
max_seq_len=self.llama_meta["get_max_seq_len"],
+ kv_updator=args.kv_updator,
+ use_i64_token=args.embedding_quantize is not None,
)
- self.llama_model = convert_pt2e(fx_graph_module)
+ self.llama_graph_module = convert_pt2e(fx_graph_module)
def lowering_modules(
self,
@@ -315,7 +386,9 @@ def lowering_modules(
fixed_point_type,
use_fp16=False,
soc_model=QcomChipset.SM8650,
- num_sharding=0,
+ num_sharding=1,
+ passes_job=OrderedDict(),
+ shared_buffer=False,
):
executorch_config = ExecutorchBackendConfig(
# For shared buffer, user must pass the memory address
@@ -331,22 +404,24 @@ def lowering_modules(
with torch.no_grad():
# backend option
backend_options = generate_htp_compiler_spec(
- use_fp16=use_fp16, use_multi_contexts=num_sharding > 0
+ use_fp16=use_fp16, use_multi_contexts=num_sharding > 1
)
compiler_specs = generate_qnn_executorch_compiler_spec(
soc_model=soc_model,
backend_options=backend_options,
- shared_buffer=False,
+ shared_buffer=shared_buffer,
)
skip_node_op_set = {"llama.fallback.default"}
partitioner = QnnPartitioner(
compiler_specs, skip_node_op_set=skip_node_op_set
)
edge_prog = capture_program(
- self.llama_model, self.inputs, custom_pass_config=frozenset()
+ self.llama_graph_module,
+ self.inputs,
+ passes_job,
)
- if num_sharding > 0:
+ if num_sharding > 1:
model_sharding.split_graph(
edge_prog.exported_program,
self.llama_meta["get_n_layers"],
@@ -363,10 +438,10 @@ def lowering_modules(
compile_config=EdgeCompileConfig(_check_ir_validity=False),
)
edge_prog_mgr = edge_prog_mgr.to_backend(partitioner)
- if num_sharding > 0:
+ if num_sharding > 1:
update_spill_fill_size(edge_prog_mgr.exported_program())
exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config)
- with open(f"{work_space}/{pte_filename}.pte", "wb") as file:
+ with open(f"{work_space}/{self.pte_filename}.pte", "wb") as file:
exec_prog_mgr.write_to_file(file)
def get_example_inputs(self, use_kv_cache=True):
@@ -376,7 +451,7 @@ def get_quant_attrs(self):
return self.quant_attrs
-def compile(args, pte_filename):
+def compile(args, pte_filename, tokenizer):
os.makedirs(args.artifact, exist_ok=True)
start_ts = time.time()
@@ -396,24 +471,37 @@ def compile(args, pte_filename):
)
llama_instance_list = []
+ use_i64_token = args.embedding_quantize is not None
with torch.device("meta"):
if args.model_mode == "kv":
llama_instance_list.append(
- LlamaModel(kv_config, output_new_cache_only=True)
+ LlamaModel(
+ kv_config, output_new_cache_only=True, use_i64_token=use_i64_token
+ )
)
elif args.model_mode == "prefill":
llama_instance_list.append(
- LlamaModel(prefill_config, output_new_cache_only=False)
+ LlamaModel(
+ prefill_config,
+ output_new_cache_only=False,
+ use_i64_token=use_i64_token,
+ )
)
elif args.model_mode == "hybrid":
llama_instance_list.append(
- LlamaModel(prefill_config, output_new_cache_only=False)
+ LlamaModel(
+ kv_config, output_new_cache_only=True, use_i64_token=use_i64_token
+ )
)
llama_instance_list.append(
- LlamaModel(kv_config, output_new_cache_only=True)
+ LlamaModel(
+ prefill_config,
+ output_new_cache_only=False,
+ use_i64_token=use_i64_token,
+ )
)
else:
- raise RuntimeError(f"No such model_mode {args.model_mode}.")
+ raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
if "model" in state_dict:
state_dict = state_dict["model"]
@@ -452,6 +540,7 @@ def compile(args, pte_filename):
assert args.tokenizer_model is not None, "Need tokenizer model for calibration"
+ passes_job = get_capture_program_passes()
if args.dtype_override is not None:
dtype_override = DType[args.dtype_override]
for i in range(len(llama_instance_list)):
@@ -460,6 +549,13 @@ def compile(args, pte_filename):
)
for i in range(len(llama_instance_list)):
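+ # When --embedding-quantize is set, the embedding falls back to CPU and takes int64 token ids, so the I64toI32 pass must skip the "tokens" input.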
+ if args.embedding_quantize:
+ llama_instance_list[i] = get_quant_embedding_transform(args)(
+ llama_instance_list[i]
+ )
+ passes_job[I64toI32][QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY]["skip_node"] = {
+ "tokens"
+ }
llama_instance_list[i] = convert_linear_to_conv2d(llama_instance_list[i])
llama_instance_list[i] = SingleLlama(
llama_instance_list[i].eval(), pte_filename
@@ -467,44 +563,68 @@ def compile(args, pte_filename):
if args.ptq:
start_quantize_ts = time.time()
- for llama_instance in llama_instance_list:
- llama_instance.quantize(
- quant_dtype=quant_dtype,
- args=args,
- custom_annotations=(
- partial(
- annotate_matmul_16a8w,
- traverse_input1=llama_instance.llama_meta["get_use_kv_cache"],
- ),
- ),
+ custom_annotations = (annotate_matmul_16a8w,)
+ if args.llama_model == "stories110m":
+ custom_annotations = custom_annotations + (
+ annotate_linear_16a8w_in_affine_layer,
)
+ if args.ptq is not None:
+ kv_quant_attrs = {}
+ for i, llama_instance in enumerate(llama_instance_list):
+ llama_instance.quantize(
+ quant_dtype=quant_dtype,
+ args=args,
+ tokenizer=tokenizer,
+ custom_annotations=custom_annotations,
+ )
+ # In hybrid mode, store the kv graph's output quant_attrs and apply them to the prefill graph's output quant_attrs later
+ if i == 0 and args.model_mode == "hybrid":
+ output_indices = 0
+ for node in llama_instance.llama_graph_module.graph.nodes:
+ if node.op == "output":
+ for output in node.args[0]:
+ kv_quant_attrs[output_indices] = output.args[1:]
+ output_indices += 1
+ break
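+ # The extended annotation takes effect on the next iteration, when the prefill instance is quantized.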
+ custom_annotations = custom_annotations + (
+ partial(
+ annotate_prefill_kv_output,
+ kv_quant_attrs=kv_quant_attrs,
+ ),
+ )
end_quantize_ts = time.time()
logging.info(f"Time for quantizing: {end_quantize_ts - start_quantize_ts}")
start_lowering_ts = time.time()
quant_attrs = None
- if len(llama_instance_list) == 1:
+ if args.model_mode in ["kv", "prefill"]:
llama_instance_list[0].lowering_modules(
args.artifact,
fixed_point_type,
use_fp16=use_fp16,
soc_model=get_soc_to_chipset_map()[args.model],
num_sharding=args.num_sharding,
+ passes_job=passes_job,
+ shared_buffer=args.shared_buffer,
)
quant_attrs = llama_instance_list[0].get_quant_attrs()
- else:
+ elif args.model_mode == "hybrid":
sample_inputs_list = [
llama_instace.inputs for llama_instace in llama_instance_list
]
edge_progs = [
- capture_program(llama_instance.llama_model, sample_input)
+ capture_program(
+ llama_instance.llama_graph_module,
+ sample_input,
+ passes_job=passes_job,
+ )
for llama_instance, sample_input in zip(
llama_instance_list, sample_inputs_list
)
]
- if args.num_sharding > 0:
+ if args.num_sharding > 1:
for i in range(len(llama_instance_list)):
model_sharding.split_graph(
edge_progs[i].exported_program,
@@ -518,14 +638,14 @@ def compile(args, pte_filename):
fixed_point_type,
)
backend_options = generate_htp_compiler_spec(
- use_fp16=use_fp16, use_multi_contexts=args.num_sharding > 0
+ use_fp16=use_fp16, use_multi_contexts=args.num_sharding > 1
)
- graph_names = ["prefill_forward", "kv_forward"]
+ graph_names = ["kv_forward", "prefill_forward"]
compiler_specs = [
generate_qnn_executorch_compiler_spec(
soc_model=get_soc_to_chipset_map()[args.model],
backend_options=backend_options,
- shared_buffer=True,
+ shared_buffer=args.shared_buffer,
multiple_graphs=True,
graph_name=graph_name,
)
@@ -539,9 +659,13 @@ def compile(args, pte_filename):
)
for i, edge_prog in enumerate(edge_progs)
]
- if args.num_sharding > 0:
- for exported_program in exported_programs:
- update_spill_fill_size(exported_program)
+ if args.num_sharding > 1:
+ max_sf_size = update_spill_fill_size(exported_programs)
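+ # Propagate the largest spill-fill buffer size into the HTP backend options of the shared compiler spec.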
+ qnn_executorch_options = flatbuffer_to_option(compiler_specs[0][0].value)
+ qnn_executorch_options.backend_options.htp_options.max_sf_buf_size = (
+ max_sf_size
+ )
+ compiler_specs[0][0].value = option_to_flatbuffer(qnn_executorch_options)
executorch_config = ExecutorchBackendConfig(
# For shared buffer, user must pass the memory address
@@ -555,6 +679,7 @@ def compile(args, pte_filename):
extract_delegate_segments=True,
)
+ bundle_progs_list = []
lower_module_dict = {name: [] for name in graph_names}
call_delegate_inputs_dict = {name: [] for name in graph_names}
call_delegate_node_name_dict = {name: [] for name in graph_names}
@@ -570,11 +695,17 @@ def compile(args, pte_filename):
call_delegate_inputs_list = []
for arg in node.args:
if arg.op == "call_function":
- while "getitem" not in arg.name:
- arg = arg.args[0]
- call_delegate_inputs_list.append(
- (arg.args[0].name, arg.args[1])
- )
+ if (
+ arg.target
+ == exir_ops.edge.quantized_decomposed.embedding_4bit.dtype
+ ):
+ call_delegate_inputs_list.append((arg.name, None))
+ else:
+ while "getitem" not in arg.name:
+ arg = arg.args[0]
+ call_delegate_inputs_list.append(
+ (arg.args[0].name, arg.args[1])
+ )
elif arg.op == "placeholder":
call_delegate_inputs_list.append((arg.name, None))
# No extra needs to do for get_attr node
@@ -584,95 +715,59 @@ def compile(args, pte_filename):
elif node.op == "output":
for arg in node.args[0]:
outputs_dict[graph_name].append((arg.args[0].name, arg.args[1]))
-
- if args.num_sharding > 0:
- bundle_progs_list = []
- for num in range(args.num_sharding - 1, -1, -1):
- processed_bytes = []
- for prog, graph_name in zip(exported_programs, graph_names):
- processed_bytes.append(
- getattr(
- prog.graph_module, f"lowered_module_{num}"
- ).processed_bytes
- )
-
- call_delegate_node = [
- list(node.users.keys())[0]
- for node in prog.graph_module.graph.nodes
- if node.op == "get_attr"
- and node.name == f"lowered_module_{num}"
- ]
- input_nodes_dict[graph_name] = [
- node
- for node in call_delegate_node[0].args
- if node.op == "placeholder"
- ]
-
- prog_mgr, bundle_progs = generate_multi_graph_program(
- compiler_specs=compiler_specs[0],
- processed_bytes=processed_bytes,
- input_nodes_dict=input_nodes_dict,
- backend_config=executorch_config,
- constant_methods=llama_instance_list[
- 1
- ].llama_meta, # kv method meta
- )
- bundle_progs_list.append(bundle_progs)
- for graph_name in graph_names:
- lower_module_dict[graph_name].append(
- prog_mgr.exported_program(graph_name).graph_module._modules.get(
- "lowered_module_0"
- )
- )
-
- exec_prog = generate_composite_llama_program(
- graph_names=graph_names,
- sample_inputs_list=sample_inputs_list,
- lower_module_dict=lower_module_dict,
- call_delegate_node_name_dict=call_delegate_node_name_dict,
- call_delegate_inputs_dict=call_delegate_inputs_dict,
- outputs_dict=outputs_dict,
- backend_config=executorch_config,
- constant_methods=llama_instance_list[1].llama_meta, # kv method meta
- )
- with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file:
- exec_prog.write_to_file(file)
- else:
+ for num in range(args.num_sharding - 1, -1, -1):
processed_bytes = []
- input_nodes_dict = {name: [] for name in graph_names}
- output_nodes_dict = {name: [] for name in graph_names}
for prog, graph_name in zip(exported_programs, graph_names):
processed_bytes.append(
- prog.graph_module.lowered_module_0.processed_bytes
+ getattr(prog.graph_module, f"lowered_module_{num}").processed_bytes
)
- input_nodes_dict[graph_name] = [
- node
+ call_delegate_node = [
+ list(node.users.keys())[0]
for node in prog.graph_module.graph.nodes
- if node.op == "placeholder"
+ if node.op == "get_attr" and node.name == f"lowered_module_{num}"
]
- output_nodes_dict[graph_name] = [
+ input_nodes_dict[graph_name] = [
node
- for node in prog.graph_module.graph.nodes
- if node.op == "output"
+ for node in call_delegate_node[0].args
+ if node.op == "placeholder"
+ or node.target
+ == exir_ops.edge.quantized_decomposed.embedding_4bit.dtype
]
-
- prog_mgr, _ = generate_multi_graph_program(
+ prog_mgr, bundle_progs = generate_multi_graph_program(
compiler_specs=compiler_specs[0],
processed_bytes=processed_bytes,
input_nodes_dict=input_nodes_dict,
- output_nodes_dict=output_nodes_dict,
backend_config=executorch_config,
- constant_methods=llama_instance_list[1].llama_meta, # kv method meta
+ constant_methods=llama_instance_list[0].llama_meta, # kv method meta
)
- with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file:
- prog_mgr.write_to_file(file)
+ bundle_progs_list.append(bundle_progs)
+ for graph_name in graph_names:
+ lower_module_dict[graph_name].append(
+ prog_mgr.exported_program(graph_name).graph_module._modules.get(
+ "lowered_module_0"
+ )
+ )
+ exec_prog = generate_composite_llama_program(
+ llama_model=llama_instance_list[1].llama_model,
+ graph_names=graph_names,
+ sample_inputs_list=sample_inputs_list,
+ lower_module_dict=lower_module_dict,
+ call_delegate_node_name_dict=call_delegate_node_name_dict,
+ call_delegate_inputs_dict=call_delegate_inputs_dict,
+ outputs_dict=outputs_dict,
+ embedding_quantize=args.embedding_quantize,
+ backend_config=executorch_config,
+ constant_methods=llama_instance_list[1].llama_meta, # kv method meta
+ )
+ with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file:
+ exec_prog.write_to_file(file)
end_lowering_ts = time.time()
logging.info(f"Time for compiling: {end_lowering_ts - start_lowering_ts}")
return quant_attrs
-def inference(args, quant_attrs, pte_filename, pre_gen_pte=""):
+def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_pte=""):
workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama"
if args.model_mode == "prefill":
@@ -682,14 +777,14 @@ def inference(args, quant_attrs, pte_filename, pre_gen_pte=""):
elif args.model_mode == "hybrid":
eval_mode = 2
else:
- raise RuntimeError(f"No such model_mode {args.model_mode}.")
+ raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len
runner_args = " ".join(
[
f"--model_path {pte_filename}.pte",
"--output_path outputs/outputs.txt",
- f"--tokenizer_path {os.path.basename(args.tokenizer_model)}",
+ f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
f'--prompt "{args.prompt}"',
f"--seq_len {seq_len}",
f"--eval_mode {eval_mode}",
@@ -697,12 +792,13 @@ def inference(args, quant_attrs, pte_filename, pre_gen_pte=""):
f"--system_prompt '{args.system_prompt}'",
f"--logits_scale {quant_attrs['scale']}",
f"--logits_offset {quant_attrs['zero_point']}",
+ f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}",
]
)
runner_cmd = " ".join(
[
f"cd {workspace} &&",
- f"./qnn_llama3_2_runner {runner_args}",
+ f"./qnn_llama_runner {runner_args}",
]
)
@@ -720,10 +816,10 @@ def inference(args, quant_attrs, pte_filename, pre_gen_pte=""):
host_id=args.host,
soc_model=args.model,
shared_buffer=args.shared_buffer,
- runner=f"examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner",
+ runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
)
# No pregen inputs, input_list is not required
- adb.push(inputs=[], input_list="", files=[args.tokenizer_model])
+ adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path])
adb.execute(custom_runner_cmd=runner_cmd)
# collect output data
@@ -751,13 +847,13 @@ def post_process():
logging.info(f"Results[{idx}]:\n{output}")
-def main():
+def _build_parser():
parser = setup_common_args_and_variables()
parser.add_argument(
"-a",
"--artifact",
- help="path for storing generated artifacts and output by this example. Default ./llama3_2_qnn",
- default="./llama3_2_qnn",
+ help="path for storing generated artifacts and output by this example. Default ./llama_qnn",
+ default="./llama_qnn",
type=str,
)
@@ -768,6 +864,13 @@ def main():
type=str,
)
+ parser.add_argument(
+ "--llama_model",
+ choices=["stories110m", "llama3_2"],
+ help="The Llama model to export. Current available options are: [stories110m, llama3_2]",
+ required=True,
+ )
+
parser.add_argument(
"--checkpoint",
help="Pass llama checkpoint.",
@@ -783,10 +886,9 @@ def main():
)
parser.add_argument(
- "--model_size",
- help="Determine what runner be used. For llama 3.2, we only support 1B/3B. ",
- choices=["1B", "3B"],
- required=True,
+ "--tokenizer_bin",
+ help="For Llama2. Pass Llama2 tokenizer binary.",
+ required=False,
type=str,
)
@@ -806,7 +908,7 @@ def main():
parser.add_argument(
"--system_prompt",
- help="Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. Default is None",
+ help="For Llama3. Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. Default is None",
default="",
type=str,
)
@@ -829,14 +931,14 @@ def main():
parser.add_argument(
"--pre_gen_pte",
- help="Run the Pre-generated llama in the given directory",
+ help="Run the pre-generated llama in the given directory.",
type=str,
)
parser.add_argument(
"--num_sharding",
type=int,
- default=0,
+ default=1,
help="Specify the number of splits by inserting the fallback custom op. The graph will be split evenly by layers.",
)
@@ -862,31 +964,81 @@ def main():
type=int,
)
- args = parser.parse_args()
+ parser.add_argument(
+ "--kv_updator",
+ help="Choose how to update kv cache during runtime",
+ choices=["smart_mask", "shift_pointer"],
+ default="smart_mask",
+ type=str,
+ )
+
+ parser.add_argument(
+ "-E",
+ "--embedding-quantize",
+ default=None,
+ type=str,
+ help="Fallback to cpu embedding operator and type of embedding quantization, ',', e.g., '4,32'.",
+ )
+
+ return parser
+
+
+def main(args) -> None:
+ parser = _build_parser()
+
+ args = parser.parse_args(args)
if args.compile_only and args.pre_gen_pte:
exit("Cannot set both compile_only and pre_gen_pte as true")
if args.model_mode == "kv":
- pte_filename = "kv_llama3_2_qnn"
+ pte_filename = "kv_llama_qnn"
elif args.model_mode == "prefill":
- pte_filename = "prefill_llama3_2_qnn"
+ pte_filename = "prefill_llama_qnn"
elif args.model_mode == "hybrid":
assert (
args.kv_seq_len >= args.prefill_seq_len
), "Please ensure kv_seq_len is >= prefill_seq_len"
- pte_filename = "hybrid_llama3_2_qnn"
+ pte_filename = "hybrid_llama_qnn"
else:
- raise RuntimeError(f"No such model_mode {args.model_mode}.")
+ raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
+
+ tokenizer = get_tokenizer(args.tokenizer_model)
+ runtime_tokenizer_path = ""
+ if args.llama_model == "stories110m":
+ assert isinstance(
+ tokenizer, SentencePieceTokenizer
+ ), f"Wrong tokenizer provided for stories110m."
+ assert (
+ args.tokenizer_bin is not None
+ ), "Please provide tokenizer_bin for stories110m."
+ runtime_tokenizer_path = args.tokenizer_bin
+ elif args.llama_model == "llama3_2":
+ assert isinstance(
+ tokenizer, Tiktoken
+ ), f"Wrong tokenizer provided for llama3_2."
+ runtime_tokenizer_path = args.tokenizer_model
+ else:
+ raise RuntimeError(f"Unknown llama_model: {args.llama_model}.")
+
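+ # smart_mask is served from a shared buffer, so --shared_buffer is enabled automatically when it is selected.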
+ if args.kv_updator == "smart_mask":
+ args.shared_buffer = True
+ args.kv_updator = smart_mask_updator
+ elif args.kv_updator == "shift_pointer":
+ args.kv_updator = shift_pointer_updator
+ else:
+ exit(f"Using an unkown kv update {args.kv_updator}")
if args.pre_gen_pte:
quant_attrs = json.load(
open(f"{args.pre_gen_pte}/{pte_filename}_quant_attrs.txt")
)
- inference(args, quant_attrs, pte_filename, args.pre_gen_pte)
+ inference(
+ args, quant_attrs, pte_filename, runtime_tokenizer_path, args.pre_gen_pte
+ )
exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
if args.compile_only:
- quant_attrs = compile(args, pte_filename)
+ quant_attrs = compile(args, pte_filename, tokenizer)
if quant_attrs:
json.dump(
{
@@ -900,7 +1052,7 @@ def main():
exit(f"Finish compile_only and save to {args.artifact}")
try:
- quant_attrs = compile(args, pte_filename)
+ quant_attrs = compile(args, pte_filename, tokenizer)
if quant_attrs:
logging.info(
f"Logit scale: {quant_attrs['scale']}; Logit offset: {quant_attrs['zero_point']}"
@@ -914,7 +1066,7 @@ def main():
)
else:
logging.warning("Quant attributes of the logit is None.")
- inference(args, quant_attrs, pte_filename)
+ inference(args, quant_attrs, pte_filename, runtime_tokenizer_path)
except Exception as e:
if args.ip and args.port != -1:
with Client((args.ip, args.port)) as conn:
@@ -925,4 +1077,4 @@ def main():
# flake8: noqa: C901
if __name__ == "__main__":
- main()
+ main(sys.argv[1:])
diff --git a/examples/qualcomm/oss_scripts/llama2/model/static_llama.py b/examples/qualcomm/oss_scripts/llama/model/static_llama.py
similarity index 97%
rename from examples/qualcomm/oss_scripts/llama2/model/static_llama.py
rename to examples/qualcomm/oss_scripts/llama/model/static_llama.py
index d1b618ed07..253abc9578 100755
--- a/examples/qualcomm/oss_scripts/llama2/model/static_llama.py
+++ b/examples/qualcomm/oss_scripts/llama/model/static_llama.py
@@ -12,10 +12,8 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
-from executorch.examples.models.llama.llama_transformer import (
- ModelArgs,
- precompute_freqs_cis,
-)
+from executorch.examples.models.llama.model_args import ModelArgs
+from executorch.examples.models.llama.rope import precompute_freqs_cis
def apply_rotary_emb_single(
@@ -299,7 +297,9 @@ def forward(
class LlamaModel(nn.Module):
- def __init__(self, config: ModelArgs, output_new_cache_only=True):
+ def __init__(
+ self, config: ModelArgs, output_new_cache_only=True, use_i64_token=False
+ ):
super().__init__()
self.dim = config.dim
self.head_dim = config.dim // config.n_heads
@@ -312,6 +312,7 @@ def __init__(self, config: ModelArgs, output_new_cache_only=True):
self.rope_freq_base = config.rope_freq_base
self.use_kv_cache = config.use_kv_cache
self.output_new_cache_only = output_new_cache_only
+ self.use_i64_token = use_i64_token
self.layers = nn.ModuleList(
[
@@ -390,10 +391,12 @@ def forward(
return logits, output_k_cache, output_v_cache
def get_example_inputs(self, use_kv_cache=True):
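+ # Token ids are int64 only when the embedding is quantized on CPU (use_i64_token); otherwise int32 is used.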
+ dtype = torch.int64 if self.use_i64_token else torch.int32
if use_kv_cache:
tokens = torch.randint(
- self.vocab_size, (self.max_batch_size, 1), dtype=torch.int32
+ self.vocab_size, (self.max_batch_size, 1), dtype=dtype
)
+
pos_ids = torch.zeros((self.max_batch_size, 1), dtype=torch.int32)
k_cache, v_cache = [], []
atten_mask = torch.full((self.max_batch_size, self.max_seq_len), -255.0)
@@ -424,7 +427,7 @@ def get_example_inputs(self, use_kv_cache=True):
)
max_promp = self.max_seq_len - 1
- tokens = torch.arange(0, max_promp, 1, dtype=torch.int32).unsqueeze(0)
+ tokens = torch.arange(0, max_promp, 1, dtype=dtype).unsqueeze(0)
atten_mask = torch.triu(torch.rand((max_promp, max_promp)), 1)
atten_mask[atten_mask != 0] = -255
return (
diff --git a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
similarity index 83%
rename from examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp
rename to examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
index 2af882580e..1bc90a11f9 100644
--- a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -9,12 +9,13 @@
/**
* @file
*
- * This tool can run Llama3.2 1B/3B with Qualcomm AI Engine Direct.
+ * This tool can run Llama2 110M and Llama3.2 1B / 3B (WIP) with Qualcomm AI
+ * Engine Direct.
*
*/
#include
-#include <executorch/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h>
+#include <executorch/examples/qualcomm/oss_scripts/llama/runner/runner.h>
#include
#include
#include
@@ -22,7 +23,7 @@
DEFINE_string(
model_path,
- "qnn_llama2.pte",
+ "kv_llama_qnn.pte",
"Model serialized in flatbuffer format.");
DEFINE_string(
@@ -42,14 +43,18 @@ DEFINE_double(
DEFINE_int32(
seq_len,
128,
- "Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens.");
+ "Total number of tokens to generate (prompt + output).");
DEFINE_int32(
eval_mode,
- 0,
+ 1,
"0: PromptProcessor(prefill) / 1: TokenGenerator(kv) / 2: HybridMode (prefill+kv)");
DEFINE_double(logits_scale, 0.0, "Logits scale");
DEFINE_int32(logits_offset, 0, "Logits offset");
+DEFINE_string(
+ kv_updator,
+ "How to update kv cache. Choose between SmartMask and ShiftPointer",
+ "SmartMask");
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
@@ -61,7 +66,8 @@ int main(int argc, char** argv) {
FLAGS_logits_scale,
FLAGS_logits_offset,
FLAGS_temperature,
- FLAGS_eval_mode);
+ FLAGS_eval_mode,
+ FLAGS_kv_updator);
std::vector<char> buf;
buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char
std::ofstream fout(FLAGS_output_path.c_str());
diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
new file mode 100644
index 0000000000..7992913a58
--- /dev/null
+++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
@@ -0,0 +1,1126 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+#include
+#include
+
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::extension::Module;
+using executorch::runtime::Error;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::MethodMeta;
+using executorch::runtime::Result;
+using executorch::runtime::TensorInfo;
+
+namespace example {
+
+IoMgrBase::IoMgrBase(std::vector<std::shared_ptr<Module>>& modules)
+ : data_ptr_(nullptr, [](void*) {}), modules_(modules) {}
+
+IoMgrBase::~IoMgrBase() {}
+
+void* IoMgrBase::get_mutable_ptr() {
+ return data_ptr_.get();
+}
+
+std::vector<Tensor> IoMgrBase::get_input_tensors(
+ int shard_index,
+ const std::string& method_name) {
+ std::vector<Tensor> ret;
+ ret.reserve(input_tensors_[method_name][shard_index].size());
+ for (TensorImpl* impl : input_tensors_[method_name][shard_index]) {
+ ret.emplace_back(Tensor(impl));
+ }
+ return ret;
+}
+
+std::vector<Tensor> IoMgrBase::get_output_tensors(
+ int shard_index,
+ const std::string& method_name) {
+ std::vector<Tensor> ret;
+ ret.reserve(output_tensors_[method_name][shard_index].size());
+ for (TensorImpl* impl : output_tensors_[method_name][shard_index]) {
+ ret.emplace_back(Tensor(impl));
+ }
+ return ret;
+}
+
+ShiftPointerIoMgr::ShiftPointerIoMgr(
+ std::vector<std::shared_ptr<Module>>& modules,
+ int32_t prefill_cache_len,
+ int32_t kv_cache_len,
+ int32_t vocab_size,
+ int32_t num_layers,
+ int32_t head_dim,
+ int32_t num_heads,
+ EvalMode eval_mode,
+ const std::string& prefill_forward_name,
+ const std::string& kv_forward_name,
+ const bool use_int64_token)
+ : IoMgrBase(modules),
+ shard_layers_({num_layers}),
+ kv_cache_len_(kv_cache_len),
+ prefill_cache_len_(prefill_cache_len),
+ vocab_size_(vocab_size),
+ num_layers_(num_layers),
+ head_dim_(head_dim),
+ num_heads_(num_heads),
+ eval_mode_(eval_mode),
+ prefill_forward_name_(prefill_forward_name),
+ kv_forward_name_(kv_forward_name),
+ use_int64_token_(use_int64_token) {
+ if (!prefill_forward_name_.empty()) {
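+ // One entry per module (graph shard) for the prefill method's input/output tensors.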
+ input_tensors_[prefill_forward_name_] =
+ std::vector<std::vector<TensorImpl*>>(modules.size());
+ output_tensors_[prefill_forward_name_] =
+ std::vector<std::vector<TensorImpl*>>(modules.size());
+ k_cache_in_[prefill_forward_name_] =
+ std::vector<std::unique_ptr<TensorImpl>>();
+ v_cache_in_[prefill_forward_name_] =
+ std::vector<std::unique_ptr<TensorImpl>>();
+ k_cache_out_[prefill_forward_name_] =
+ std::vector<std::unique_ptr<TensorImpl>>();
+ v_cache_out_[prefill_forward_name_] =
+ std::vector<std::unique_ptr<TensorImpl>>();
+ }
+ if (!kv_forward_name_.empty()) {
+ input_tensors_[kv_forward_name_] =
+ std::vector<std::vector<TensorImpl*>>(modules.size());
+ output_tensors_[kv_forward_name_] =
+ std::vector