2024-04-30 nightly release (38fad23)
pytorchbot committed Apr 30, 2024
1 parent 8ba0eff commit 2a03600
Showing 33 changed files with 1,494 additions and 593 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/doc-build.yml
@@ -94,11 +94,11 @@ jobs:
# Get github.ref for the output doc folder. By default "main"
# If matches a tag like refs/tags/v1.12.0-rc3 or
# refs/tags/v1.12.0 convert to 1.12
- GITHUB_REF=${{ github.ref }}
+ export GITHUB_REF=${{ github.ref }}
# Convert refs/tags/v1.12.0rc3 into 1.12.
# Adopted from https://github.com/pytorch/pytorch/blob/main/.github/workflows/_docs.yml#L150C11-L155C13
if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\\.[0-9]+)\\. ]]; then
if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+) ]]; then
TARGET_FOLDER="${BASH_REMATCH[1]}"
else
TARGET_FOLDER="main"
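For reference, here is how the updated tag pattern maps refs to doc folders, as a minimal standalone sketch (hypothetical sample refs, not part of the committed workflow):

# Hypothetical check of the tag-to-folder mapping used in doc-build.yml above.
for REF in refs/tags/v1.12.0 refs/tags/v1.12.0rc3 refs/heads/main; do
  if [[ "${REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+) ]]; then
    echo "${REF} -> ${BASH_REMATCH[1]}"  # e.g. refs/tags/v1.12.0rc3 -> 1.12
  else
    echo "${REF} -> main"                # non-tag refs fall back to the main folder
  fi
done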
64 changes: 64 additions & 0 deletions .github/workflows/pull.yml
@@ -239,6 +239,70 @@ jobs:
# see if we can import the module successfully
python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
test-binary-size-linux-gcc:
name: test-binary-size-linux-gcc
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-gcc9
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 90
script: |
# The generic Linux job chooses to use the base env, not the one set up by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
# build the size test binary (cmake-out/test/size_test)
bash test/build_size_test.sh
strip cmake-out/test/size_test
output=$(ls -la cmake-out/test/size_test)
arr=($output)
size=${arr[4]}
# threshold=48120 on devserver with gcc11.4
# todo(lfq): update once binary size is below 50kb.
threshold="51504"
if [[ "$size" -le "$threshold" ]]; then
echo "Success $size <= $threshold"
else
echo "Fail $size > $threshold"
exit 1
fi
test-binary-size-linux:
name: test-binary-size-linux
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-clang12
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 90
script: |
# The generic Linux job chooses to use the base env, not the one set up by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
# build module for executorch.extension.pybindings.portable_lib
bash test/build_size_test.sh
strip cmake-out/test/size_test
output=$(ls -la cmake-out/test/size_test)
arr=($output)
size=${arr[4]}
# threshold=48120 on devserver with gcc11.4
# todo(lfq): update once binary size is below 50kb.
threshold="51768"
if [[ "$size" -le "$threshold" ]]; then
echo "Success $size <= $threshold"
else
echo "Fail $size > $threshold"
exit 1
fi
unittest:
uses: ./.github/workflows/_unittest.yml
with:
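Both size-check jobs above read the binary size out of the fifth column of ls -la. An equivalent sketch using GNU stat instead, shown only as an alternative (the threshold value mirrors the gcc job; the committed workflow keeps the ls-based approach):

# Alternative sketch of the size gate, not the committed workflow logic.
binary=cmake-out/test/size_test
size=$(stat --printf=%s "${binary}")  # file size in bytes via GNU coreutils stat
threshold=51504                       # hypothetical here; taken from the gcc job above
if (( size <= threshold )); then
  echo "Success ${size} <= ${threshold}"
else
  echo "Fail ${size} > ${threshold}"
  exit 1
fi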
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -120,7 +120,7 @@ endif()
# disables exceptions and runtime type information (RTTI).
set(CMAKE_CXX_FLAGS_RELEASE
"-ffunction-sections -fdata-sections -fno-exceptions -fno-rtti")
- if(NOT APPLE)
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
endif()

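The CMake change above now adds the -s link flag (strip symbols) only for GCC release builds. If it is useful to confirm after a build that a binary really was stripped, a small illustrative check on a Linux host (not part of the build system; the path reuses the size-test binary from the jobs above) is:

# Illustrative post-build check, assuming file(1) is available.
binary=cmake-out/test/size_test
if file "${binary}" | grep -q "not stripped"; then
  echo "${binary} still carries symbol tables"
else
  echo "${binary} is stripped"
fi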
78 changes: 78 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl
@@ -0,0 +1,78 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1) uniform PRECISION sampler3D existing_out;
layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;

layout(set = 0, binding = 3) uniform PRECISION restrict CopyArgs {
ivec4 out_sizes;
ivec4 in_sizes;
// Analogous to the range variable in copy. It defines the number of channels
// being copied.
int channel_range;
int src_channel_offset;
int dst_channel_offset;
int unused;
// Operates on (x, y, z) extents.
ivec3 range;
int unused1;
ivec3 dst_offset;
int unused2;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

void main() {
// Note: Unlike other shaders, the range is often not equal to the destination
// texture extent.
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (any(greaterThanEqual(pos, range))) {
return;
}

const ivec3 out_pos = pos + dst_offset;

const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim);

// First read the existing values to make sure the boundary values stay.
VEC4_T v = VEC4_T(texelFetch(existing_out, out_pos, 0));

for (int i=0; i<4; i++) {
ivec4 in_whcn = out_whcn;

in_whcn.z = out_whcn.z - dst_channel_offset + i;

// Handle the partial update at the beginning of a channel in an existing tensor.
// If the source channel index is below zero or exceeds the range, we skip
// updating the element to avoid overwriting existing data.
if ((in_whcn.z < 0) || (in_whcn.z >= channel_range)) {
continue;
}

// Readjust for the source offset.
in_whcn.z = in_whcn.z + src_channel_offset;

ivec4 in_elem_pos = to_texture_elem_pos(in_whcn, in_sizes, packed_dim);
v[i] = VEC4_T(texelFetch(image_in, in_elem_pos.xyz, 0))[in_elem_pos.w];
}

imageStore(image_out, out_pos, v);
}
10 changes: 10 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml
@@ -0,0 +1,10 @@
copy_channel_offset:
parameter_names_with_default_values:
DTYPE: float
NDIM: 3
generate_variant_forall:
DTYPE:
- VALUE: half
- VALUE: float
shader_variants:
- NAME: copy_channel_offset
16 changes: 1 addition & 15 deletions backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl
@@ -10,26 +10,12 @@

#define PRECISION ${PRECISION}

- #define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;

- layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
- ivec3 out_limits;
- };
-
- layout(set = 0, binding = 3) uniform PRECISION restrict InLimits {
- ivec3 in_limits;
- };



- layout(set = 0, binding = 4) uniform PRECISION restrict CopyArgs {
+ layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs {
ivec3 range;
int unused0;
ivec3 src_offset;
95 changes: 95 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Cat.cpp
@@ -0,0 +1,95 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Copy.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

void add_cat_default_node(
ComputeGraph& graph,
ValueRef in_list_ref,
ValueRef dim_ref,
ValueRef out) {
ValueListPtr input_list = graph.get_value_list(in_list_ref);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
}

int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
vTensorPtr t_out = graph.get_tensor(out);

NchwDim nchw_dim = normalize_to_nchw_dim(*t_out, dim);

// TODO: Find ways to factor out the similar code for width, height, and batch
if (nchw_dim == DimWidth) {
api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
api::utils::ivec3 range = t_in->texture_limits();
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset.data[0] += range.data[0];
}

} else if (nchw_dim == DimHeight) {
api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
api::utils::ivec3 range = t_in->texture_limits();
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset.data[1] += range.data[1];
}
} else if (nchw_dim == DimBatch) {
api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
api::utils::ivec3 range = t_in->texture_limits();
add_copy_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset.data[2] += range.data[2];
}
} else if (nchw_dim == DimChannel) {
int32_t src_offset = 0;
int32_t dst_offset = 0;

for (ValueRef input_ref : *input_list) {
vTensorPtr t_in = graph.get_tensor(input_ref);
int32_t range = dim_at<Dim4D::Channel>(t_in->sizes());
add_copy_channel_offset_node(
graph, input_ref, range, src_offset, dst_offset, out);
dst_offset += range;
}
} else {
VK_THROW("Unexpected value of nchw_dim=", nchw_dim);
}
}

void cat_default(ComputeGraph& graph, const std::vector<ValueRef>& args) {
add_cat_default_node(graph, args[0], args[1], args[2]);
}

REGISTER_OPERATORS {
VK_REGISTER_OP(aten.cat.default, cat_default);
}

} // namespace vkcompute