2024-04-27 nightly release (dc726f9)
pytorchbot committed Apr 27, 2024
1 parent 8f03cf4 commit 9efaf0e
Showing 23 changed files with 327 additions and 264 deletions.
11 changes: 9 additions & 2 deletions .github/workflows/android.yml
@@ -49,11 +49,18 @@ jobs:
bash build/test_android_ci.sh
mkdir -p artifacts-to-be-uploaded
mkdir -p artifacts-to-be-uploaded/arm64-v8a/
mkdir -p artifacts-to-be-uploaded/x86_64/
# Copy the jar to S3
cp extension/android/build/libs/executorch.jar artifacts-to-be-uploaded/
# Copy the app and its test suite to S3
cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/
cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk artifacts-to-be-uploaded/
# Also copy the shared libraries
cp cmake-out-android/lib/*.a artifacts-to-be-uploaded/
# Also copy the libraries
cp cmake-out-android-arm64-v8a/lib/*.a artifacts-to-be-uploaded/arm64-v8a/
cp cmake-out-android-arm64-v8a/extension/android/*.so artifacts-to-be-uploaded/arm64-v8a/
cp cmake-out-android-x86_64/lib/*.a artifacts-to-be-uploaded/x86_64/
cp cmake-out-android-x86_64/extension/android/*.so artifacts-to-be-uploaded/x86_64/
# Upload the app and its test suite to S3 so that they can be downloaded by the test job
upload-artifacts:
29 changes: 12 additions & 17 deletions README.md
@@ -20,31 +20,26 @@ Key value propositions of ExecuTorch are:
For a comprehensive technical overview of ExecuTorch and step-by-step tutorials,
please visit our documentation website [for the latest release](https://pytorch.org/executorch/stable/index.html) (or the [main branch](https://pytorch.org/executorch/main/index.html)).

## Important: This is a preview release
## Feedback

This is a preview version of ExecuTorch and should be used for testing and
evaluation purposes only. It is not recommended for use in production settings.
We welcome any feedback, suggestions, and bug reports from the community to help
us improve the technology. Please use the [PyTorch
us improve our technology. Please use the [PyTorch
Forums](https://discuss.pytorch.org/c/executorch) for discussion and feedback
about ExecuTorch using the **ExecuTorch** category, and our [GitHub
repository](https://github.com/pytorch/executorch/issues) for bug reporting.

The ExecuTorch code and APIs are still changing quickly, and there are not yet
any guarantees about forward/backward source compatibility. We recommend using
the latest `v#.#.#` release tag from the
[Releases](https://github.com/pytorch/executorch/releases) page when
experimenting with this preview release.
We recommend using the latest release tag from the
[Releases](https://github.com/pytorch/executorch/releases) page when developing.

## Directory Structure

```
executorch
├── backends # Backend delegate implementations.
├── build # Utilities for managing the build system.
├── bundled_program # Utilities for attaching reference inputs and outputs to models. TODO move to extension
├── codegen # Tooling to autogenerate bindings between kernels and the runtime. TODO move to tool
├── configurations # TODO delete this
├── bundled_program # Utilities for attaching reference inputs and outputs to models.
├── codegen # Tooling to autogenerate bindings between kernels and the runtime.
├── configurations
├── docs # Static docs tooling
├── examples # Examples of various user flows, such as model export, delegates, and runtime execution.
├── exir # Ahead of time library, model capture and lowering apis.
@@ -69,20 +64,20 @@ executorch
| ├── portable # Reference implementations of ATen operators.
| ├── prim_ops # Special ops used in executorch runtime for control flow and symbolic primitives.
| ├── quantized
├── profiler # Utilities for profiling. TODO delete in favor of ETDump in sdk/
├── runtime # core cpp runtime of executorch
├── profiler # Utilities for profiling.
├── runtime # Core cpp runtime
| ├── backend # Backend delegate runtime APIs
| ├── core # Core structures used across all levels of the runtime
| ├── executor # Model loading, initialization, and execution.
| ├── kernel # Kernel registration and management.
| ├── platform # Layer between architecture specific code and user calls.
├── schema # ExecuTorch program definition, TODO move under serialization/
├── schema # ExecuTorch program definition
├── scripts # Utility scripts for size management, dependency management, etc.
├── sdk # Model profiling, debugging, and introspection.
├── shim # Compatibility layer between OSS and Internal builds
├── test # Broad scoped end2end tests
├── third-party # third-party dependencies
├── util # TODO delete this
├── third-party # Third-party dependencies
├── util
```

## License
@@ -26,49 +26,23 @@ layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
BUF_T buffer_in[];
};

// Corresponds to {1,4,3,9} in the example below.
layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
ivec4 sizes;
};

// Corresponds to {3,3,1,11} in the example below.
layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
ivec4 original_sizes;
};

// Corresponds to {1,12} in the example below.
layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
ivec2 padded_sizes;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
* Computes special prepacking for a depthwise convolution. Each shader invocation
* calculates the input buffer location to read into the desired texel. This
* packing was originally developed on CPU and that approach is described in the
* rest of this comment. Refer to the code-level comments for how we translate
* it to GPU by reversing the steps.
*
* Consider an example weight tensor of size {11,1,3,3}. The following
* transformations will be applied.
*
* 1. Pad the N dim so that it is a multiple of 4. In this case, 1
* batch of padding is added, producing a tensor of size {12,1,3,3}.
* at::pad(x, {0,0,0,0,0,0,0,1}, "constant", 0);
*
* 2. Flatten the last two dims by reshaping the tensor:
* x.reshape({12,1,9});
*
* 3. "Fold" the N dim into the C dim. Split the tensor along the N dim so that
* each split has 4 channels.
* x.reshape({3,4,1,9});
*
* 4. Stack the batches on each other vertically by permuting the N and C dims
* and reshaping the tensor.
* x.permute({1,0,2,3}).reshape({4,3,9});
* packing was originally developed on CPU here:
* https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L58-L118
*/
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -78,39 +52,40 @@ void main() {
return;
}

// As in usual staging shaders, map from GPU texel position to normal CPU
// buffer indices: (9,3) -> (4,3,9)
// Map tensor_idx to normal buffer_i
const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim);

// Re-map the normal CPU buffer indices to special indices, through a series
// of mappings: reshape is a no-op to the underlying indices, so we only map
// for pad and permute.
const int Np = padded_sizes.x;
// Compute modified tensor_idx by inverting the CPU function
const int N = original_sizes.w;
const int C = original_sizes.z;
const int H = original_sizes.y;
const int W = original_sizes.x;
const int Y = sizes.y;

const ivec4 p1 = p0 / W;
const ivec4 p2 = p1 / H;

// Undo step 3 permute: (4,3,1,9) -> (3,4,1,9)
const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (C * H * W));
const ivec4 n = (p2 % Y) * 4 + (p2 / Y);
const ivec4 h = p1 % H;
const ivec4 w = p0 % W;

// Undo step 1 pad: (12,1,3,3) -> (11,1,3,3)
// For values in the padded region, write zero instead of buffer data.
const ivec4 n = p1 / (C * H * W);
const ivec4 mask = ivec4(greaterThanEqual(n, ivec4(N)));
// Map modified tensor_idx to modified buffer_i
// Zero out if modified tensor idx is out of bounds
const ivec4 buf_i = n * C*H*W + h * W + w;
const bvec4 mask = bvec4(lessThan(n, ivec4(N)));

VEC4_T texel = VEC4_T(0);
if (mask.x == 0) {
texel.x = SCALAR_T(buffer_in[p1.x]);
if (mask.x) {
texel.x = SCALAR_T(buffer_in[buf_i.x]);
}
if (mask.y == 0) {
texel.y = SCALAR_T(buffer_in[p1.y]);
if (mask.y) {
texel.y = SCALAR_T(buffer_in[buf_i.y]);
}
if (mask.z == 0) {
texel.z = SCALAR_T(buffer_in[p1.z]);
if (mask.z) {
texel.z = SCALAR_T(buffer_in[buf_i.z]);
}
if (mask.w == 0) {
texel.w = SCALAR_T(buffer_in[p1.w]);
if (mask.w) {
texel.w = SCALAR_T(buffer_in[buf_i.w]);
}

imageStore(image_out, pos.xy, texel);
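
Note (editorial): the CPU packing that the removed comment describes for the {11,1,3,3} example, together with the inverse index math the updated shader now performs, can be sketched roughly in PyTorch as follows. This is an illustrative translation only; the names `x0` and `packed` and the verification loop are not code from the repository.

```
import torch
import torch.nn.functional as F

# Example depthwise weight tensor of size {11,1,3,3} (N,C,H,W).
x0 = torch.arange(11 * 1 * 3 * 3, dtype=torch.float32).reshape(11, 1, 3, 3)

# 1. Pad the N dim to a multiple of 4: {11,1,3,3} -> {12,1,3,3}.
x = F.pad(x0, (0, 0, 0, 0, 0, 0, 0, 1), mode="constant", value=0)

# 2. Flatten the last two dims: {12,1,3,3} -> {12,1,9}.
x = x.reshape(12, 1, 9)

# 3. "Fold" the N dim into the C dim, 4 channels per split: -> {3,4,1,9}.
x = x.reshape(3, 4, 1, 9)

# 4. Stack the batches vertically by permuting N and C: -> {4,3,9}.
packed = x.permute(1, 0, 2, 3).reshape(4, 3, 9)

# Inverse mapping, mirroring the updated shader: recover (n, h, w) from a
# flat index p0 into the packed buffer and compare against the original.
N, C, H, W = 11, 1, 3, 3   # original_sizes in the shader
Y = packed.shape[1]        # = 3, i.e. sizes.y in the shader
buf = packed.flatten()
for p0 in range(buf.numel()):
    p1 = p0 // W
    p2 = p1 // H
    n = (p2 % Y) * 4 + p2 // Y
    h, w = p1 % H, p0 % W
    if n < N:              # padded batches map to zero, as in the shader mask
        assert buf[p0] == x0[n, 0, h, w]
```

As the diff above shows, the updated shader derives these indices directly from the packed position rather than chaining `swap_adj_dims` mappings.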
103 changes: 29 additions & 74 deletions backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl
@@ -26,63 +26,23 @@ layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
BUF_T buffer_in[];
};

// Corresponds to {1,4,9,24} in the example below.
layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
ivec4 sizes;
};

// Corresponds to {3,3,7,10} in the example below.
layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
ivec4 original_sizes;
};

// Corresponds to {8,12} in the example below.
layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
ivec2 padded_sizes;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
* Computes special prepacking for a 2D convolution. Each shader invocation
* calculates the input buffer location to read into the desired texel. This
* packing was originally developed on CPU and that approach is described in the
* rest of this comment. Refer to the code-level comments for how we translate
* it to GPU by reversing the steps.
*
* Consider an example weight tensor of size {10,7,3,3}. The following
* transformations will be applied.
*
* 1. Pad the N and C dims so that both are a multiple of 4. In this case, 2
* batches and 1 channel of padding are added, producing a tensor of size
* {12,8,3,3}.
* at::pad(x, {0,0,0,0,0,1,0,2}, "constant", 0);
*
* 2. Split the tensor along the C dim so that each split has 4 channels.
* x.reshape({12,2,4,3,3});
*
* 3. For each split, "fold" the C dim into the W dim. Suppose the first rows
* at H=0 of the split have values
* 0,1,2 | 10,11,12 | 20,21,22 | 30,31,32
*
* where | denotes a channel boundary. Then, the goal is to combine those rows
* into one row with the values
* 0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32
*
* x.permute({0,1,3,4,2}).reshape({12,2,3,12});
*
* 4. Stack the splits belonging to the same batch horizontally by swapping the
* C and H dims.
* x.permute({0,2,1,3}).reshape({12,3,24});
*
* 5. Repeat a similar process to "fold" the N dim into the C dim. Split along
* the N dim so that each split has 4 batches.
* x.reshape({3,4,3,24});
*
* 6. Stack the batches on each other vertically by swapping the N and C dims.
* x.permute({1,0,2,3}).reshape({4,9,24});
* calculates the input buffer locations to read into the desired texel. This
* packing was originally developed on CPU here:
* https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211
*/
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -92,49 +52,44 @@ void main() {
return;
}

// As in usual staging shaders, map from GPU texel position to normal CPU
// buffer indices: (24,9) -> (4,9,24)
// Map tensor_idx to normal buffer_i
const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim);

// Re-map the normal CPU buffer indices to special indices, through a series
// of mappings: reshape is a no-op to the underlying indices, so we only map
// for pad and permute.
const int Np = padded_sizes.y;
const int Cp = padded_sizes.x;
// Compute modified tensor_idx by inverting the CPU function
const int N = original_sizes.w;
const int C = original_sizes.z;
const int H = original_sizes.y;
const int W = original_sizes.x;
const int J = sizes.x / (4*W);
const int K = sizes.y / H;

const ivec4 p1 = p0 / 4;
const ivec4 p2 = p1 / W;
const ivec4 p3 = p2 / J;
const ivec4 p4 = p3 / H;

const ivec4 n = (p4 % K) * 4 + (p4 / K);
const ivec4 c = (p2 % J) * 4 + (p0 % 4);
const ivec4 h = p3 % H;
const ivec4 w = p1 % W;

// Undo step 6 permute: (4,3,3,24) -> (3,4,3,24)
// Undo step 4 permute: (12,3,2,12) -> (12,2,3,12)
// Undo step 3 permute, part 1: (12,2,3h,3w,4) -> (12,2,3h,4,3w)
// Undo step 3 permute, part 2: (12,2,3h,4,3w) -> (12,2,4,3h,3w)
const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (H * Cp * W));
const ivec4 p2 = swap_adj_dims(p1, H, (Cp / 4), (W * 4));
const ivec4 p3 = swap_adj_dims(p2, W, 4, 1);
const ivec4 p4 = swap_adj_dims(p3, H, 4, W);

// Undo step 1 pad: (12,8,3,3) -> (10,7,3,3)
// For values in the padded region, write zero instead of buffer data.
const ivec4 c = p4 % (Cp * H * W) / (H * W);
const ivec4 n = p4 / (Cp * H * W);
const ivec4 p5 = p4 - n * (Cp - C) * H * W;
const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) |
ivec4(greaterThanEqual(n, ivec4(N)));
// Map modified tensor_idx to modified buffer_i
// Zero out if modified tensor idx is out of bounds
const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w;
const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C))));

VEC4_T texel = VEC4_T(0);
if (mask.x == 0) {
texel.x = SCALAR_T(buffer_in[p5.x]);
if (mask.x) {
texel.x = SCALAR_T(buffer_in[buf_i.x]);
}
if (mask.y == 0) {
texel.y = SCALAR_T(buffer_in[p5.y]);
if (mask.y) {
texel.y = SCALAR_T(buffer_in[buf_i.y]);
}
if (mask.z == 0) {
texel.z = SCALAR_T(buffer_in[p5.z]);
if (mask.z) {
texel.z = SCALAR_T(buffer_in[buf_i.z]);
}
if (mask.w == 0) {
texel.w = SCALAR_T(buffer_in[p5.w]);
if (mask.w) {
texel.w = SCALAR_T(buffer_in[buf_i.w]);
}

imageStore(image_out, pos.xy, texel);
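
Note (editorial): similarly, the six CPU packing steps that the removed comment walks through for the {10,7,3,3} example, and the inverse index computation in the updated shader, can be sketched roughly as follows. Again this is only an illustrative PyTorch translation; `x0`, `packed`, and the check loop are not repository code.

```
import torch
import torch.nn.functional as F

# Example conv2d weight tensor of size {10,7,3,3} (N,C,H,W).
x0 = torch.arange(10 * 7 * 3 * 3, dtype=torch.float32).reshape(10, 7, 3, 3)

# 1. Pad N and C to multiples of 4: {10,7,3,3} -> {12,8,3,3}.
x = F.pad(x0, (0, 0, 0, 0, 0, 1, 0, 2), mode="constant", value=0)

# 2. Split the C dim into groups of 4 channels: -> {12,2,4,3,3}.
x = x.reshape(12, 2, 4, 3, 3)

# 3. Fold the C dim of each split into the W dim: -> {12,2,3,12}.
x = x.permute(0, 1, 3, 4, 2).reshape(12, 2, 3, 12)

# 4. Stack the splits of one batch horizontally (swap C and H): -> {12,3,24}.
x = x.permute(0, 2, 1, 3).reshape(12, 3, 24)

# 5. Split the N dim into groups of 4 batches: -> {3,4,3,24}.
x = x.reshape(3, 4, 3, 24)

# 6. Stack the batches vertically (swap N and C): -> {4,9,24}.
packed = x.permute(1, 0, 2, 3).reshape(4, 9, 24)

# Inverse mapping, mirroring the updated shader: recover (n, c, h, w) from a
# flat index p0 into the packed buffer and compare against the original.
N, C, H, W = 10, 7, 3, 3              # original_sizes in the shader
J = packed.shape[2] // (4 * W)        # = 2, i.e. sizes.x / (4*W)
K = packed.shape[1] // H              # = 3, i.e. sizes.y / H
buf = packed.flatten()
for p0 in range(buf.numel()):
    p1 = p0 // 4
    p2 = p1 // W
    p3 = p2 // J
    p4 = p3 // H
    n = (p4 % K) * 4 + p4 // K
    c = (p2 % J) * 4 + p0 % 4
    h, w = p3 % H, p1 % W
    if n < N and c < C:               # padded region maps to zero
        assert buf[p0] == x0[n, c, h, w]
```

As with the depthwise variant, the updated shader computes the source indices directly instead of composing `swap_adj_dims` mappings.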