Removing skip_for_blackhole and fix some minor issues for blackhole conv2d (#17222)

### Issues
#17221
#17216


### Problem description
Enable conv2d blackhole tests: to begin with, remove the skips in
`tests/ttnn/unit_tests/operations/test_new_conv2d.py` and report on the
state of the tests. Fix the minor bugs described in the issues above.

### What's changed

- Removed `@skip_for_blackhole()` from tests that already run on
wormhole_b0
- Removed the blackhole skip for tests that need an 8x8 grid - the skip was
added because these tests OOM on the 8x7 grid of wormhole_b0, which should
not happen on blackhole (see the skip-condition sketch after this list)
- Removed some unused imports and functions from
`tests/ttnn/unit_tests/operations/test_new_conv2d.py`
- Changed the alignment of the output tensor in the `calculate_L1_usage`
function (see the alignment sketch after this list)
- Removed the `TT_FATAL` checks on num_cores in the width-sharded factory
- Added a workaround for shallow convs (from
[#17058](#17058), fyi
@mywoodstock; see the workaround sketch after this list)
- Changed act_block_w_div from 2 to 1 in some tests (this value is
correlated with the number of input channels and the grid size; 2 becomes
invalid in these cases on blackhole due to the larger grid)
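
A minimal, self-contained sketch of the updated skip condition from `test_conv_ws` (the real change is in the test diff below); the helper and its arguments are illustrative stand-ins for the pytest device fixture and `is_wormhole_b0()`:

```python
def should_skip_conv_ws(core_grid_y: int, is_wormhole_b0: bool) -> bool:
    # Only wormhole_b0 needs the full 8x8 grid; blackhole's larger grid
    # does not hit the OOM that motivated the old unconditional skip.
    return core_grid_y != 8 and is_wormhole_b0


assert should_skip_conv_ws(core_grid_y=7, is_wormhole_b0=True)        # 8x7 wormhole_b0: skip
assert not should_skip_conv_ws(core_grid_y=10, is_wormhole_b0=False)  # blackhole: run
```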
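
A rough, standalone illustration of the `calculate_L1_usage` alignment change (the actual change is in the C++ diff below); the 16- and 64-byte values stand in for `hal.get_alignment(...)` results and are placeholders for illustration, not confirmed numbers for any architecture:

```python
def round_up(value: int, multiple: int) -> int:
    """Round value up to the nearest multiple (mirrors tt::round_up)."""
    return ((value + multiple - 1) // multiple) * multiple


per_core_out_width_bytes = 1040  # hypothetical per-core output row width in bytes

# Old behaviour: per-core output rows were always rounded up to 32 bytes.
old_row_bytes = round_up(per_core_out_width_bytes, 32)                # 1056

# New behaviour: round up to the stricter of the L1 and DRAM alignments.
alignment_bytes = max(16, 64)                                         # placeholder alignments
new_row_bytes = round_up(per_core_out_width_bytes, alignment_bytes)   # 1088

print(old_row_bytes, new_row_bytes)
```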
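
A simplified Python rendering of the shallow-conv workaround picked up from #17058 for `untilize_with_halo` (see the last C++ diff below); `round_up` and the 32-byte blackhole override mirror that diff, while the function and its arguments are simplified stand-ins for the C++ code:

```python
def round_up(value: int, multiple: int) -> int:
    return ((value + multiple - 1) // multiple) * multiple


def aligned_input_nstick_nbytes(out_stick_nbytes: int,
                                buffer_alignment: int,
                                is_blackhole: bool) -> int:
    # Workaround: force a 32-byte alignment on blackhole until the allocator
    # alignment is fixed (tracked in tenstorrent/tt-metal#13609).
    alignment = 32 if is_blackhole else buffer_alignment
    if out_stick_nbytes % alignment != 0:
        return round_up(out_stick_nbytes, alignment)
    return out_stick_nbytes


print(aligned_input_nstick_nbytes(100, buffer_alignment=64, is_blackhole=True))   # 128
print(aligned_input_nstick_nbytes(128, buffer_alignment=64, is_blackhole=False))  # 128
```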

### State of blackhole tests after these changes
repro: `pytest tests/ttnn/unit_tests/operations/test_new_conv2d.py`

PASSED: 2301
FAILED: 8 
99.7% pass rate

All 8 failures are PCC errors - #17226

### Checklist
- [x] Post commit CI passes -
https://github.com/tenstorrent/tt-metal/actions/runs/13058074023
- [ ] Blackhole Post commit (if applicable)
- [x] Model regression CI testing passes (if applicable) -
https://github.com/tenstorrent/tt-metal/actions/runs/13070618785
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] **(For models and ops writers)** Full [new
models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml)
tests pass
- [ ] New/Existing tests provide coverage for changes
skrsticTT authored Jan 31, 2025
1 parent 2d624db commit 25ee758
Showing 4 changed files with 23 additions and 45 deletions.
43 changes: 4 additions & 39 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -7,38 +7,15 @@
import torch
import pytest
from models.utility_functions import (
is_wormhole_b0,
skip_for_grayskull,
is_grayskull,
is_wormhole_b0,
is_x2_harvested,
is_blackhole,
skip_for_blackhole,
is_blackhole,
)
from tests.ttnn.utils_for_testing import assert_with_pcc, check_with_pcc, check_with_pcc_without_tensor_printout
import ttnn


def _nearest_32(x):
return math.ceil(x / 32) * 32


from tests.ttnn.ttnn_utility_fuction import get_shard_grid_from_num_cores

# def plot_diff(vals, fid, nsticks, stick_len):
# import matplotlib.pyplot as plt

# plt.clf()
# plt.figure(figsize=(100, 50))
# plt.xticks(torch.arange(0, stick_len) + 0.5, range(0, stick_len))
# plt.yticks(torch.arange(0, nsticks) + 0.5, range(0, nsticks))
# # plt.grid()
# bool_vals = vals > 0
# plt.imshow(bool_vals, interpolation="none", vmin=0, vmax=1, cmap="Blues")
# plt.savefig(f"diff_core_{fid}.png", bbox_inches="tight", pad_inches=0.1)
# plt.close()


def run_conv(
device,
@@ -445,7 +422,6 @@ def test_conv_features(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 2 * 16384}], indirect=True)
@pytest.mark.parametrize("groups", [1, 2])
@pytest.mark.parametrize("stride", [2])
@@ -536,7 +512,7 @@ def test_conv_features_multi_device(
(2, 256, 2048, 9, 9, 3, 3, 1, 1, 1),
(2, 512, 2048, 17, 17, 3, 3, 1, 1, 1),
(2, 768, 768, 17, 17, 3, 3, 0, 0, 1),
(2, 1280, 2560, 15, 15, 3, 3, 1, 1, 2),
(2, 1280, 2560, 15, 15, 3, 3, 1, 1, 1),
(2, 1280, 1280, 17, 17, 3, 3, 1, 1, 1),
[1, 3024, 1232, 14, 14, 1, 1, 0, 0, 1],
(2, 768, 32, 9, 9, 3, 3, 1, 1, 1),
@@ -578,8 +554,8 @@ def test_conv_ws(
auto_shard,
tilized_input,
):
if device.core_grid.y != 8:
pytest.skip("Needs 8x8 Grid")
if device.core_grid.y != 8 and is_wormhole_b0():
pytest.skip("Needs 8x8 grid for wormhole_b0")

stride_h = stride
stride_w = stride
@@ -833,9 +809,6 @@ def test_resnet50_conv_gs(
config_override,
auto_shard,
):
if is_blackhole():
pytest.skip("This test is for Grayskull only")

if batch_size > 8 and (activations_dtype != ttnn.bfloat8_b or weights_dtype != ttnn.bfloat8_b):
pytest.skip("Batch > 8 must be run fully bfp8")
if batch_size == 20 and input_channels >= 128 and filter_width > 1:
@@ -879,7 +852,6 @@ def test_resnet50_conv_gs(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
@@ -1017,7 +989,6 @@ def test_resnet50_conv_wh(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
@@ -1336,7 +1307,6 @@ def test_sd_conv(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
@@ -1607,7 +1577,6 @@ def test_unet_conv(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override, use_shallow_conv_variant",
@@ -1700,7 +1669,6 @@ def test_unet_conv_wh(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize(
"batch_size",
[1],
@@ -1800,7 +1768,6 @@ def test_unet_conv_groups_2_wh(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize(
"batch_size",
[1],
@@ -1900,7 +1867,6 @@ def test_unet_conv_groups_4_6_wh(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize(
"batch_size",
[1],
@@ -2131,7 +2097,7 @@ def test_conv_core_nondivis(
(768, 768, 16, 16, 1, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(1280, 1280, 16, 16, 1, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(1280, 1280, 8, 8, 1, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(1280, 2560, 8, 8, 2, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(1280, 2560, 8, 8, 1, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(128, 128, 8, 8, 1, ttnn.TensorMemoryLayout.BLOCK_SHARDED),
(128, 128, 16, 16, 1, ttnn.TensorMemoryLayout.BLOCK_SHARDED),
(128, 128, 32, 32, 1, ttnn.TensorMemoryLayout.BLOCK_SHARDED),
@@ -2718,7 +2684,6 @@ def test_conv_for_vanilla_unet(
)


@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
9 changes: 7 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -3,12 +3,14 @@
// SPDX-License-Identifier: Apache-2.0

#include <sys/types.h>
#include <algorithm>
#include <cstdint>
#include <optional>
#include <tuple>

#include "conv2d_utils.hpp"
#include <tt-metalium/buffer_constants.hpp>
#include "tt-metalium/hal.hpp"
#include "ttnn/operations/conv/conv2d/device/conv2d_op.hpp"
#include "ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp"
#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
@@ -983,6 +985,9 @@ conv_op_l1_usage conv2d::calculate_L1_usage(
(per_core_out_matrix_height_ntiles + act_block_h_ntiles - 1) / act_block_h_ntiles;
uint32_t out_block_h_ntiles_padded = num_blocks_act_h_per_core * act_block_h_ntiles;

// TODO: this needs to be changed - needs to be independent from dram alignment
const uint32_t alignment_bytes = std::max(hal.get_alignment(HalMemType::L1), hal.get_alignment(HalMemType::DRAM));

if (shard_layout == TensorMemoryLayout::WIDTH_SHARDED) {
uint32_t conv_output_c_per_core = per_core_out_matrix_width_ntiles * tt::constants::TILE_WIDTH;

@@ -1061,7 +1066,7 @@ conv_op_l1_usage conv2d::calculate_L1_usage(
} else if (output_dtype == DataType::FLOAT32) {
per_core_out_width_aligned *= 4;
}
output_size = round_up(per_core_out_width_aligned, 32) * pconfig.per_core_out_matrix_height;
output_size = round_up(per_core_out_width_aligned, alignment_bytes) * pconfig.per_core_out_matrix_height;
} else {
output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size;
}
@@ -1164,7 +1169,7 @@ conv_op_l1_usage conv2d::calculate_L1_usage(
} else if (output_dtype == DataType::FLOAT32) {
per_core_out_width_aligned *= 4;
}
output_size = round_up(per_core_out_width_aligned, 32) * pconfig.per_core_out_matrix_height;
output_size = round_up(per_core_out_width_aligned, alignment_bytes) * pconfig.per_core_out_matrix_height;
} else {
output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size;
}
@@ -168,8 +168,6 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh
const auto& p_config = parallelization_config;
uint32_t num_cores_x = p_config.grid_size.x;
uint32_t num_cores_y = p_config.grid_size.y;
TT_FATAL(num_cores_x < 13, "Error");
TT_FATAL(num_cores_y < 10, "Error");
uint32_t per_core_out_matrix_height_ntiles =
div_up(p_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT);
uint32_t per_core_out_matrix_width_ntiles = div_up(p_config.per_core_out_matrix_width, tt::constants::TILE_WIDTH);
@@ -170,8 +170,18 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
bool const is_width_sharded = input_tensor.memory_config().memory_layout == TensorMemoryLayout::WIDTH_SHARDED;

auto aligned_input_nstick_nbytes = out_stick_nbytes;
if (out_stick_nbytes % input_tensor.buffer()->alignment() != 0) {
aligned_input_nstick_nbytes = tt::round_up(out_stick_nbytes, input_tensor.buffer()->alignment());
log_debug(tt::LogOp, "out_stick_nbytes = {}", out_stick_nbytes);
log_debug(tt::LogOp, "input_tensor.buffer()->alignment() = {}", input_tensor.buffer()->alignment());

uint32_t input_buffer_alignment = input_tensor.buffer()->alignment();
if (device->arch() == tt::ARCH::BLACKHOLE) {
// FIXME: Remove this workaround once the alignment is fixed in the allocator:
// https://github.com/tenstorrent/tt-metal/pull/13762, ticket:
// https://github.com/tenstorrent/tt-metal/issues/13609
input_buffer_alignment = 32; // this is a workaround for the issue mentioned above
}
if (out_stick_nbytes % input_buffer_alignment != 0) {
aligned_input_nstick_nbytes = tt::round_up(out_stick_nbytes, input_buffer_alignment);
}
// reader kernel
std::vector<uint32_t> reader_ct_args = {
