2024-04-30 nightly release (38fad23)
pytorchbot committed Apr 30, 2024
1 parent 8ba0eff commit 2a03600
Showing 33 changed files with 1,494 additions and 593 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/doc-build.yml
@@ -94,11 +94,11 @@ jobs:
# Get github.ref for the output doc folder. By default "main"
# If matches a tag like refs/tags/v1.12.0-rc3 or
# refs/tags/v1.12.0 convert to 1.12
- GITHUB_REF=${{ github.ref }}
+ export GITHUB_REF=${{ github.ref }}
# Convert refs/tags/v1.12.0rc3 into 1.12.
# Adopted from https://github.com/pytorch/pytorch/blob/main/.github/workflows/_docs.yml#L150C11-L155C13
if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\\.[0-9]+)\\. ]]; then
if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+) ]]; then
TARGET_FOLDER="${BASH_REMATCH[1]}"
else
TARGET_FOLDER="main"
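For reference, here is how the updated tag pattern maps refs to doc folders, as a minimal standalone sketch (hypothetical sample refs, not part of the committed workflow):

# Hypothetical check of the tag-to-folder mapping used in doc-build.yml above.
for REF in refs/tags/v1.12.0 refs/tags/v1.12.0rc3 refs/heads/main; do
  if [[ "${REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+) ]]; then
    echo "${REF} -> ${BASH_REMATCH[1]}"  # e.g. refs/tags/v1.12.0rc3 -> 1.12
  else
    echo "${REF} -> main"                # non-tag refs fall back to the main folder
  fi
done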
64 changes: 64 additions & 0 deletions .github/workflows/pull.yml
@@ -239,6 +239,70 @@ jobs:
# see if we can import the module successfully
python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
test-binary-size-linux-gcc:
name: test-binary-size-linux-gcc
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-gcc9
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 90
script: |
# The generic Linux job chooses to use the base env, not the one set up by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
# build the size test binary (cmake-out/test/size_test)
bash test/build_size_test.sh
strip cmake-out/test/size_test
output=$(ls -la cmake-out/test/size_test)
arr=($output)
size=${arr[4]}
# threshold=48120 on devserver with gcc11.4
# todo(lfq): update once binary size is below 50kb.
threshold="51504"
if [[ "$size" -le "$threshold" ]]; then
echo "Success $size <= $threshold"
else
echo "Fail $size > $threshold"
exit 1
fi
test-binary-size-linux:
name: test-binary-size-linux
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-clang12
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 90
script: |
# The generic Linux job chooses to use the base env, not the one set up by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
# build module for executorch.extension.pybindings.portable_lib
bash test/build_size_test.sh
strip cmake-out/test/size_test
output=$(ls -la cmake-out/test/size_test)
arr=($output)
size=${arr[4]}
# threshold=48120 on devserver with gcc11.4
# todo(lfq): update once binary size is below 50kb.
threshold="51768"
if [[ "$size" -le "$threshold" ]]; then
echo "Success $size <= $threshold"
else
echo "Fail $size > $threshold"
exit 1
fi
unittest:
uses: ./.github/workflows/_unittest.yml
with:
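Both size-check jobs above read the binary size out of the fifth column of ls -la. An equivalent sketch using GNU stat instead, shown only as an alternative (the threshold value mirrors the gcc job; the committed workflow keeps the ls-based approach):

# Alternative sketch of the size gate, not the committed workflow logic.
binary=cmake-out/test/size_test
size=$(stat --printf=%s "${binary}")  # file size in bytes via GNU coreutils stat
threshold=51504                       # hypothetical here; taken from the gcc job above
if (( size <= threshold )); then
  echo "Success ${size} <= ${threshold}"
else
  echo "Fail ${size} > ${threshold}"
  exit 1
fi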
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -120,7 +120,7 @@ endif()
# disables exceptions and runtime type information (RTTI).
set(CMAKE_CXX_FLAGS_RELEASE
"-ffunction-sections -fdata-sections -fno-exceptions -fno-rtti")
- if(NOT APPLE)
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
endif()

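The CMake change above now adds the -s link flag (strip symbols) only for GCC release builds. If it is useful to confirm after a build that a binary really was stripped, a small illustrative check on a Linux host (not part of the build system; the path reuses the size-test binary from the jobs above) is:

# Illustrative post-build check, assuming file(1) is available.
binary=cmake-out/test/size_test
if file "${binary}" | grep -q "not stripped"; then
  echo "${binary} still carries symbol tables"
else
  echo "${binary} is stripped"
fi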
78 changes: 78 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl
@@ -0,0 +1,78 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1) uniform PRECISION sampler3D existing_out;
layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;

layout(set = 0, binding = 3) uniform PRECISION restrict CopyArgs {
ivec4 out_sizes;
ivec4 in_sizes;
// Analogous to the range variable in copy. It defines the number of channels
// being copied.
int channel_range;
int src_channel_offset;
int dst_channel_offset;
int unused;
// Operates on (x, y, z) extents.
ivec3 range;
int unused1;
ivec3 dst_offset;
int unused2;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

void main() {
// Note: Unlike other shaders, the range is often not equal to the destination
// texture extent.
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (any(greaterThanEqual(pos, range))) {
return;
}

const ivec3 out_pos = pos + dst_offset;

const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim);

// First read the existing values to make sure the boundary values stay.
VEC4_T v = VEC4_T(texelFetch(existing_out, out_pos, 0));

for (int i=0; i<4; i++) {
ivec4 in_whcn = out_whcn;

in_whcn.z = out_whcn.z - dst_channel_offset + i;

// Handle the partial update at the beginning of a channel in an existing tensor.
// If the source channel index is below zero or exceeds the range, we skip
// updating the element to avoid overwriting existing data.
if ((in_whcn.z < 0) || (in_whcn.z >= channel_range)) {
continue;
}

// Readjust for the source offset.
in_whcn.z = in_whcn.z + src_channel_offset;

ivec4 in_elem_pos = to_texture_elem_pos(in_whcn, in_sizes, packed_dim);
v[i] = VEC4_T(texelFetch(image_in, in_elem_pos.xyz, 0))[in_elem_pos.w];
}

imageStore(image_out, out_pos, v);
}
10 changes: 10 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml
@@ -0,0 +1,10 @@
copy_channel_offset:
parameter_names_with_default_values:
DTYPE: float
NDIM: 3
generate_variant_forall:
DTYPE:
- VALUE: half
- VALUE: float
shader_variants:
- NAME: copy_channel_offset
16 changes: 1 addition & 15 deletions backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl
@@ -10,26 +10,12 @@

#define PRECISION ${PRECISION}

- #define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;

- layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
- ivec3 out_limits;
- };
-
- layout(set = 0, binding = 3) uniform PRECISION restrict InLimits {
- ivec3 in_limits;
- };



- layout(set = 0, binding = 4) uniform PRECISION restrict CopyArgs {
+ layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs {
ivec3 range;
int unused0;
ivec3 src_offset;
95 changes: 95 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Cat.cpp
@@ -0,0 +1,95 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Copy.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

void add_cat_default_node(
ComputeGraph& graph,
ValueRef in_list_ref,
ValueRef dim_ref,
ValueRef out) {
ValueListPtr input_list = graph.get_value_list(in_list_ref);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
}

int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
vTensorPtr t_out = graph.get_tensor(out);

NchwDim nchw_dim = normalize_to_nchw_dim(*t_out, dim);

// TODO: Find ways to factor out the similar code for width, height, and batch
if (nchw_dim == DimWidth) {
api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
api::utils::ivec3 range = t_in->texture_limits();
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset.data[0] += range.data[0];
}

} else if (nchw_dim == DimHeight) {
api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
api::utils::ivec3 range = t_in->texture_limits();
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset.data[1] += range.data[1];
}
} else if (nchw_dim == DimBatch) {
api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
api::utils::ivec3 range = t_in->texture_limits();
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset.data[2] += range.data[2];
}
} else if (nchw_dim == DimChannel) {
int32_t src_offset = 0;
int32_t dst_offset = 0;

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
int32_t range = dim_at<Dim4D::Channel>(t_in->sizes());
add_copy_channel_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset += range;
}
} else {
VK_THROW("Unexpected value of nchw_dim=", nchw_dim);
}
}

void cat_default(ComputeGraph& graph, const std::vector<ValueRef>& args) {
add_cat_default_node(graph, args[0], args[1], args[2]);
}

REGISTER_OPERATORS {
VK_REGISTER_OP(aten.cat.default, cat_default);
}

} // namespace vkcompute