Removing skip_for_blackhole and fix some minor issues for blackhole conv2d (#17222)

### Issues
#17221
#17216


### Problem description
Enable conv2d blackhole tests: to begin with, remove the skips in
`tests/ttnn/unit_tests/operations/test_new_conv2d.py` and report on the
state of the tests. Fix the minor bugs described in the issues above.

### What's changed

- Removed `@skip_for_blackhole()` from tests that already run on
wormhole_b0
- Removed the blackhole skip for tests that need an 8x8 grid - the skip was
added because these tests OOM on the 8x7 grid of wormhole_b0, which should
not happen on blackhole (see the skip-condition sketch after this list)
- Removed some unused imports and functions from
`tests/ttnn/unit_tests/operations/test_new_conv2d.py`
- Changed the alignment of the output tensor in the `calculate_L1_usage`
function (see the alignment sketch after this list)
- Removed the `TT_FATAL` checks on num_cores in the width-sharded factory
- Added a workaround for shallow convs (from
[#17058](#17058), fyi
@mywoodstock; see the workaround sketch after this list)
- Changed act_block_w_div from 2 to 1 in some tests (this value is
correlated with the number of input channels and the grid size; 2 becomes
invalid in these cases on blackhole due to the larger grid)
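
A minimal, self-contained sketch of the updated skip condition from `test_conv_ws` (the real change is in the test diff below); the helper and its arguments are illustrative stand-ins for the pytest device fixture and `is_wormhole_b0()`:

```python
def should_skip_conv_ws(core_grid_y: int, is_wormhole_b0: bool) -> bool:
    # Only wormhole_b0 needs the full 8x8 grid; blackhole's larger grid
    # does not hit the OOM that motivated the old unconditional skip.
    return core_grid_y != 8 and is_wormhole_b0


assert should_skip_conv_ws(core_grid_y=7, is_wormhole_b0=True)        # 8x7 wormhole_b0: skip
assert not should_skip_conv_ws(core_grid_y=10, is_wormhole_b0=False)  # blackhole: run
```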
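
A rough, standalone illustration of the `calculate_L1_usage` alignment change (the actual change is in the C++ diff below); the 16- and 64-byte values stand in for `hal.get_alignment(...)` results and are placeholders for illustration, not confirmed numbers for any architecture:

```python
def round_up(value: int, multiple: int) -> int:
    """Round value up to the nearest multiple (mirrors tt::round_up)."""
    return ((value + multiple - 1) // multiple) * multiple


per_core_out_width_bytes = 1040  # hypothetical per-core output row width in bytes

# Old behaviour: per-core output rows were always rounded up to 32 bytes.
old_row_bytes = round_up(per_core_out_width_bytes, 32)                # 1056

# New behaviour: round up to the stricter of the L1 and DRAM alignments.
alignment_bytes = max(16, 64)                                         # placeholder alignments
new_row_bytes = round_up(per_core_out_width_bytes, alignment_bytes)   # 1088

print(old_row_bytes, new_row_bytes)
```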
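
A simplified Python rendering of the shallow-conv workaround picked up from #17058 for `untilize_with_halo` (see the last C++ diff below); `round_up` and the 32-byte blackhole override mirror that diff, while the function and its arguments are simplified stand-ins for the C++ code:

```python
def round_up(value: int, multiple: int) -> int:
    return ((value + multiple - 1) // multiple) * multiple


def aligned_input_nstick_nbytes(out_stick_nbytes: int,
                                buffer_alignment: int,
                                is_blackhole: bool) -> int:
    # Workaround: force a 32-byte alignment on blackhole until the allocator
    # alignment is fixed (tracked in tenstorrent/tt-metal#13609).
    alignment = 32 if is_blackhole else buffer_alignment
    if out_stick_nbytes % alignment != 0:
        return round_up(out_stick_nbytes, alignment)
    return out_stick_nbytes


print(aligned_input_nstick_nbytes(100, buffer_alignment=64, is_blackhole=True))   # 128
print(aligned_input_nstick_nbytes(128, buffer_alignment=64, is_blackhole=False))  # 128
```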

### State of blackhole tests after these changes
repro: `pytest tests/ttnn/unit_tests/operations/test_new_conv2d.py`

PASSED: 2301
FAILED: 8 
99.7% pass rate

All 8 failures are PCC errors - #17226

### Checklist
- [x] Post commit CI passes -
https://github.com/tenstorrent/tt-metal/actions/runs/13058074023
- [ ] Blackhole Post commit (if applicable)
- [x] Model regression CI testing passes (if applicable) -
https://github.com/tenstorrent/tt-metal/actions/runs/13070618785
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] **(For models and ops writers)** Full [new
models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml)
tests pass
- [ ] New/Existing tests provide coverage for changes
skrsticTT authored Jan 31, 2025
1 parent 2d624db commit 25ee758
Showing 4 changed files with 23 additions and 45 deletions.
43 changes: 4 additions & 39 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -7,38 +7,15 @@
import torch
import pytest
from models.utility_functions import (
is_wormhole_b0,
skip_for_grayskull,
is_grayskull,
is_wormhole_b0,
is_x2_harvested,
is_blackhole,
skip_for_blackhole,
is_blackhole,
)
from tests.ttnn.utils_for_testing import assert_with_pcc, check_with_pcc, check_with_pcc_without_tensor_printout
import ttnn


def _nearest_32(x):
return math.ceil(x / 32) * 32


from tests.ttnn.ttnn_utility_fuction import get_shard_grid_from_num_cores

# def plot_diff(vals, fid, nsticks, stick_len):
# import matplotlib.pyplot as plt

# plt.clf()
# plt.figure(figsize=(100, 50))
# plt.xticks(torch.arange(0, stick_len) + 0.5, range(0, stick_len))
# plt.yticks(torch.arange(0, nsticks) + 0.5, range(0, nsticks))
# # plt.grid()
# bool_vals = vals > 0
# plt.imshow(bool_vals, interpolation="none", vmin=0, vmax=1, cmap="Blues")
# plt.savefig(f"diff_core_{fid}.png", bbox_inches="tight", pad_inches=0.1)
# plt.close()


def run_conv(
device,
@@ -445,7 +422,6 @@ def test_conv_features(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 2 * 16384}], indirect=True)
@pytest.mark.parametrize("groups", [1, 2])
@pytest.mark.parametrize("stride", [2])
@@ -536,7 +512,7 @@ def test_conv_features_multi_device(
(2, 256, 2048, 9, 9, 3, 3, 1, 1, 1),
(2, 512, 2048, 17, 17, 3, 3, 1, 1, 1),
(2, 768, 768, 17, 17, 3, 3, 0, 0, 1),
(2, 1280, 2560, 15, 15, 3, 3, 1, 1, 2),
(2, 1280, 2560, 15, 15, 3, 3, 1, 1, 1),
(2, 1280, 1280, 17, 17, 3, 3, 1, 1, 1),
[1, 3024, 1232, 14, 14, 1, 1, 0, 0, 1],
(2, 768, 32, 9, 9, 3, 3, 1, 1, 1),
@@ -578,8 +554,8 @@ def test_conv_ws(
auto_shard,
tilized_input,
):
if device.core_grid.y != 8:
pytest.skip("Needs 8x8 Grid")
if device.core_grid.y != 8 and is_wormhole_b0():
pytest.skip("Needs 8x8 grid for wormhole_b0")

stride_h = stride
stride_w = stride
@@ -833,9 +809,6 @@ def test_resnet50_conv_gs(
config_override,
auto_shard,
):
if is_blackhole():
pytest.skip("This test is for Grayskull only")

if batch_size > 8 and (activations_dtype != ttnn.bfloat8_b or weights_dtype != ttnn.bfloat8_b):
pytest.skip("Batch > 8 must be run fully bfp8")
if batch_size == 20 and input_channels >= 128 and filter_width > 1:
@@ -879,7 +852,6 @@ def test_resnet50_conv_gs(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
@@ -1017,7 +989,6 @@ def test_resnet50_conv_wh(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
@@ -1336,7 +1307,6 @@ def test_sd_conv(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
@@ -1607,7 +1577,6 @@ def test_unet_conv(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override, use_shallow_conv_variant",
@@ -1700,7 +1669,6 @@ def test_unet_conv_wh(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize(
"batch_size",
[1],
@@ -1800,7 +1768,6 @@ def test_unet_conv_groups_2_wh(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize(
"batch_size",
[1],
@@ -1900,7 +1867,6 @@ def test_unet_conv_groups_4_6_wh(


@skip_for_grayskull()
@skip_for_blackhole()
@pytest.mark.parametrize(
"batch_size",
[1],
@@ -2131,7 +2097,7 @@ def test_conv_core_nondivis(
(768, 768, 16, 16, 1, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(1280, 1280, 16, 16, 1, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(1280, 1280, 8, 8, 1, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(1280, 2560, 8, 8, 2, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(1280, 2560, 8, 8, 1, ttnn.TensorMemoryLayout.WIDTH_SHARDED),
(128, 128, 8, 8, 1, ttnn.TensorMemoryLayout.BLOCK_SHARDED),
(128, 128, 16, 16, 1, ttnn.TensorMemoryLayout.BLOCK_SHARDED),
(128, 128, 32, 32, 1, ttnn.TensorMemoryLayout.BLOCK_SHARDED),
@@ -2718,7 +2684,6 @@ def test_conv_for_vanilla_unet(
)


@skip_for_blackhole()
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
9 changes: 7 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -3,12 +3,14 @@
// SPDX-License-Identifier: Apache-2.0

#include <sys/types.h>
#include <algorithm>
#include <cstdint>
#include <optional>
#include <tuple>

#include "conv2d_utils.hpp"
#include <tt-metalium/buffer_constants.hpp>
#include "tt-metalium/hal.hpp"
#include "ttnn/operations/conv/conv2d/device/conv2d_op.hpp"
#include "ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp"
#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
@@ -983,6 +985,9 @@ conv_op_l1_usage conv2d::calculate_L1_usage(
(per_core_out_matrix_height_ntiles + act_block_h_ntiles - 1) / act_block_h_ntiles;
uint32_t out_block_h_ntiles_padded = num_blocks_act_h_per_core * act_block_h_ntiles;

// TODO: this needs to be changed - needs to be independent from dram alignment
const uint32_t alignment_bytes = std::max(hal.get_alignment(HalMemType::L1), hal.get_alignment(HalMemType::DRAM));

if (shard_layout == TensorMemoryLayout::WIDTH_SHARDED) {
uint32_t conv_output_c_per_core = per_core_out_matrix_width_ntiles * tt::constants::TILE_WIDTH;

@@ -1061,7 +1066,7 @@ conv_op_l1_usage conv2d::calculate_L1_usage(
} else if (output_dtype == DataType::FLOAT32) {
per_core_out_width_aligned *= 4;
}
output_size = round_up(per_core_out_width_aligned, 32) * pconfig.per_core_out_matrix_height;
output_size = round_up(per_core_out_width_aligned, alignment_bytes) * pconfig.per_core_out_matrix_height;
} else {
output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size;
}
@@ -1164,7 +1169,7 @@ conv_op_l1_usage conv2d::calculate_L1_usage(
} else if (output_dtype == DataType::FLOAT32) {
per_core_out_width_aligned *= 4;
}
output_size = round_up(per_core_out_width_aligned, 32) * pconfig.per_core_out_matrix_height;
output_size = round_up(per_core_out_width_aligned, alignment_bytes) * pconfig.per_core_out_matrix_height;
} else {
output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size;
}
@@ -168,8 +168,6 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh
const auto& p_config = parallelization_config;
uint32_t num_cores_x = p_config.grid_size.x;
uint32_t num_cores_y = p_config.grid_size.y;
TT_FATAL(num_cores_x < 13, "Error");
TT_FATAL(num_cores_y < 10, "Error");
uint32_t per_core_out_matrix_height_ntiles =
div_up(p_config.per_core_out_matrix_height, tt::constants::TILE_HEIGHT);
uint32_t per_core_out_matrix_width_ntiles = div_up(p_config.per_core_out_matrix_width, tt::constants::TILE_WIDTH);
@@ -170,8 +170,18 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
bool const is_width_sharded = input_tensor.memory_config().memory_layout == TensorMemoryLayout::WIDTH_SHARDED;

auto aligned_input_nstick_nbytes = out_stick_nbytes;
if (out_stick_nbytes % input_tensor.buffer()->alignment() != 0) {
aligned_input_nstick_nbytes = tt::round_up(out_stick_nbytes, input_tensor.buffer()->alignment());
log_debug(tt::LogOp, "out_stick_nbytes = {}", out_stick_nbytes);
log_debug(tt::LogOp, "input_tensor.buffer()->alignment() = {}", input_tensor.buffer()->alignment());

uint32_t input_buffer_alignment = input_tensor.buffer()->alignment();
if (device->arch() == tt::ARCH::BLACKHOLE) {
// FIXME: Remove this workaround once the alignment is fixed in the allocator:
// https://github.com/tenstorrent/tt-metal/pull/13762, ticket:
// https://github.com/tenstorrent/tt-metal/issues/13609
input_buffer_alignment = 32; // this is a workaround for the issue mentioned above
}
if (out_stick_nbytes % input_buffer_alignment != 0) {
aligned_input_nstick_nbytes = tt::round_up(out_stick_nbytes, input_buffer_alignment);
}
// reader kernel
std::vector<uint32_t> reader_ct_args = {
